Python DefaultDocumentFormatter Examples

Programming Language: Python

Namespace/Package Name: mongo_connector.doc_managers.formatters

Examples at hotexamples.com: 27

Python DefaultDocumentFormatter - 27 examples found. These are the top-rated real-world Python examples of mongo_connector.doc_managers.formatters.DefaultDocumentFormatter, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

DefaultDocumentFormatter(16)

format_document(10)

transform_value(1)

Example #1

Show file

File: elastic_doc_manager.py Project: sha1sum/elastic-doc-manager

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client and record the manager settings.

        ``kwargs['clientOptions']`` is forwarded to ``Elasticsearch``.  When
        ``kwargs`` contains ``'aws'``, requests are signed with ``AWSV4Sign``
        using the explicit ``access_id``/``secret_key`` pair if both are
        present, otherwise the default session credentials.  The region is
        taken from the session, then from the ``'region'`` entry, and
        finally defaults to ``'us-east-1'``.

        Raises:
            ConfigurationError: ``'aws'`` was requested but ``_HAS_AWS`` is
                False.
        """
        # Copy so the signing options added below never leak into the
        # caller's configuration dict.
        client_options = dict(kwargs.get('clientOptions', {}))
        if 'aws' in kwargs:
            if _HAS_AWS is False:
                raise ConfigurationError('aws extras must be installed to sign Elasticsearch requests')
            aws_args = kwargs.get('aws') or {}
            if 'access_id' in aws_args and 'secret_key' in aws_args:
                aws = aws_session.Session(
                    aws_access_key_id=aws_args['access_id'],
                    aws_secret_access_key=aws_args['secret_key'])
            else:
                aws = aws_session.Session()
            credentials = aws.get_credentials()
            # Fall back to the documented default instead of raising
            # KeyError when neither the session nor the config has a region.
            region = aws.region_name or aws_args.get('region', 'us-east-1')
            client_options['http_auth'] = AWSV4Sign(credentials, region, 'es')
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = es_connection.RequestsHttpConnection
        self.elastic = Elasticsearch(
            hosts=[url], **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

Example #2

Show file

File: neo4j_doc_manager.py Project: dsjennin/neo4j_doc_manager

 def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
   """Open the graph at ``url`` and remember the sync settings.

   The ``clientOptions`` entry of ``kwargs`` (``None`` when absent) is kept
   on ``self.kwargs``.
   """
   self.graph = Graph(url)
   self.chunk_size = chunk_size
   self.unique_key = unique_key
   self.auto_commit_interval = auto_commit_interval
   self._formatter = DefaultDocumentFormatter()
   client_options = kwargs.get("clientOptions")
   self.kwargs = client_options

Example #3

Show file

File: elastic_doc_manager.py Project: gwaller/mongo-connector

 def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
              unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
     """Connect to Elasticsearch at ``url`` and record the sync settings.

     ``run_auto_commit()`` is invoked when ``auto_commit_interval`` is
     neither ``None`` nor ``0``.  Extra ``kwargs`` are accepted but unused.
     """
     self.elastic = Elasticsearch(hosts=[url])
     # default type is string, change if needed
     self.doc_type = 'string'
     self.chunk_size = chunk_size
     self.unique_key = unique_key
     self.auto_commit_interval = auto_commit_interval
     wants_auto_commit = self.auto_commit_interval not in (None, 0)
     if wants_auto_commit:
         self.run_auto_commit()
     self._formatter = DefaultDocumentFormatter()

Example #4

Show file

File: neo4j_doc_manager.py Project: mayank-chutani/mongo-connector

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Open the graph at ``url`` and record the sync settings.

        The base64-encoded value of the ``NEO4J_AUTH`` environment variable
        is stored on ``self.authorization_token``.

        Raises:
            TypeError: ``NEO4J_AUTH`` is not set.
        """
        self.graph = Graph(url)
        self.url = url
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")

        auth = os.getenv('NEO4J_AUTH')
        if auth is None:
            # b64encode(None) raised TypeError too; keep the type but say why.
            raise TypeError('NEO4J_AUTH environment variable must be set')
        if not isinstance(auth, bytes):
            # On Python 3 os.getenv returns str, which b64encode rejects.
            auth = auth.encode('utf-8')
        self.authorization_token = base64.b64encode(auth)

Example #5

Show file

File: elastic_doc_manager.py Project: izzui/mongo-connector

 def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
              unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
              meta_index_name="mongodb_meta", meta_type="mongodb_meta",
              **kwargs):
     """Connect to Elasticsearch at ``url`` and record the sync settings.

     ``run_auto_commit()`` is invoked when ``auto_commit_interval`` is
     neither ``None`` nor ``0``.  Extra ``kwargs`` are accepted but unused.
     """
     self.elastic = Elasticsearch(hosts=[url])
     self.chunk_size = chunk_size
     self.unique_key = unique_key
     self.meta_type = meta_type
     self.meta_index_name = meta_index_name
     self.auto_commit_interval = auto_commit_interval
     wants_auto_commit = self.auto_commit_interval not in (None, 0)
     if wants_auto_commit:
         self.run_auto_commit()
     self._formatter = DefaultDocumentFormatter()

Example #6

Show file

File: elastic2_doc_manager.py Project: mongodb-labs/elastic2-doc-manager

    def __init__(
        self,
        url,
        auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
        unique_key="_id",
        chunk_size=DEFAULT_MAX_BULK,
        meta_index_name="mongodb_meta",
        meta_type="mongodb_meta",
        attachment_field="content",
        **kwargs
    ):
        """Create the Elasticsearch client, bulk buffer and auto-committer.

        ``url`` may be a single host or a list of hosts.
        ``kwargs['clientOptions']`` is forwarded to ``Elasticsearch``; when
        ``kwargs`` contains ``'aws'`` the client is configured to sign
        requests.  ``kwargs['autoSendInterval']`` overrides
        ``DEFAULT_SEND_INTERVAL``.

        Raises:
            errors.InvalidConfiguration: ``'aws'`` was requested but the aws
                extras are not installed.
        """
        # Copy so the signing options added below never leak into the
        # caller's configuration dict.
        client_options = dict(kwargs.get("clientOptions", {}))
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]"
                )
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options["connection_class"] = es_connection.RequestsHttpConnection
        # isinstance also accepts list subclasses, unlike a type() identity test.
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(
            self, self.auto_send_interval, self.auto_commit_interval
        )
        self.auto_commiter.start()

Example #7

Show file

File: elastic2_doc_manager.py Project: luisobo/elastic2-doc-manager

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Build the Elasticsearch client and record the manager settings.

        The client comes from ``_create_elasticsearch_client`` using the
        ``clientOptions`` entry of ``kwargs`` (an empty dict when absent).
        ``run_auto_commit()`` is invoked when ``auto_commit_interval`` is
        neither ``None`` nor ``0``.
        """
        client_options = kwargs.get('clientOptions', {})
        self.elastic = self._create_elasticsearch_client(url, client_options)

        self.chunk_size = chunk_size
        self.unique_key = unique_key
        self.meta_type = meta_type
        self.meta_index_name = meta_index_name
        self.auto_commit_interval = auto_commit_interval
        wants_auto_commit = self.auto_commit_interval not in (None, 0)
        if wants_auto_commit:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.attachment_field = attachment_field
        self.has_attachment_mapping = False

Example #8

Show file

File: elastic2_doc_manager.py Project: sliwinski-milosz/elastic2-doc-manager

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client and bulk buffer.

        ``url`` may be a single host or a list of hosts.
        ``kwargs['clientOptions']`` is forwarded to ``Elasticsearch`` with
        sniffing defaults filled in; when ``kwargs`` contains ``'aws'`` the
        client is configured to sign requests.  ``run_auto_commit()`` is
        invoked when ``auto_commit_interval`` is neither ``None`` nor ``0``.

        Raises:
            errors.InvalidConfiguration: ``'aws'`` was requested but the aws
                extras are not installed.
        """
        # Copy so the defaults and signing options added below never leak
        # into the caller's configuration dict.
        client_options = dict(kwargs.get('clientOptions', {}))
        client_options.setdefault('sniff_on_start', True)
        client_options.setdefault('sniff_on_connection_fail', True)
        client_options.setdefault('sniffer_timeout', 60)
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        # isinstance also accepts list subclasses, unlike a type() identity test.
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = Lock()

        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

Example #9

Show file

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Connect to Elasticsearch at ``url`` and record the settings.

        ``kwargs['clientOptions']`` is forwarded to ``Elasticsearch``.
        ``run_auto_commit()`` is invoked when ``auto_commit_interval`` is
        neither ``None`` nor ``0``.
        """
        self.elastic = Elasticsearch(hosts=[url],
                                     **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace.

        Splits ``namespace`` on its first '.' and lowercases the index part.
        """
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval keeps ``run_auto_commit`` from scheduling
        another timer.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Return ``doc`` with ``update_spec`` applied.

        A spec with neither "$set" nor "$unset" is returned unchanged.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a database command document to Elasticsearch.

        Handles 'dropDatabase', 'create' and 'drop'; 'renameCollection'
        raises ``errors.OperationFailed``.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {
                        "enabled": True
                    }})

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document.
        """
        # Refresh first so the get() below sees the latest indexed version.
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index,
                                    doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch.

        ``doc['_id']`` is removed while indexing and restored (in its
        converted form) before returning.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index,
                           doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        An empty ``docs`` sequence is silently ignored.
        """
        def docs_to_upsert():
            """Yield one source action and one metadata action per doc."""
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        # Store the original namespace, exactly as upsert()
                        # and insert_file() do; the lowercased index alone
                        # loses the collection name.
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index the file ``f`` as a base64 attachment plus its metadata.

        The attachment mapping is installed on the first call only.
        """
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {
                        "type": "attachment"
                    }
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        self.elastic.index(index=index,
                           doc_type=doc_type,
                           body=doc,
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document and its metadata entry from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        self.elastic.delete(index=index,
                            doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        The 'body' keyword is passed to ``scan`` as its query; each yielded
        source dict carries the hit's '_id'.
        """
        for hit in scan(self.elastic,
                        query=kwargs.pop('body', None),
                        scroll='10m',
                        **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(index=self.meta_index_name,
                                   body={
                                       "query": {
                                           "filtered": {
                                               "filter": {
                                                   "range": {
                                                       "_ts": {
                                                           "gte": start_ts,
                                                           "lte": end_ts
                                                       }
                                                   }
                                               }
                                           }
                                       }
                                   })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server.

        Re-arms itself with a ``Timer`` until the interval is ``None`` or 0.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns the metadata source with its '_id' added, or ``None`` when
        there are no hits.
        """
        try:
            result = self.elastic.search(index=self.meta_index_name,
                                         body={
                                             "query": {
                                                 "match_all": {}
                                             },
                                             "sort": [{
                                                 "_ts": "desc"
                                             }],
                                         },
                                         size=1)["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        return None

Example #10

Show file

File: elastic2_doc_manager.py Project: hongmi/elastic2-doc-manager

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Create the Elasticsearch client, bulk buffer and auto-committer.

        ``url`` may be a single host or a list of hosts.  Recognised
        ``kwargs``: ``clientOptions`` (forwarded to ``Elasticsearch``),
        ``aws`` (enables request signing), ``autoSendInterval``,
        ``createMultiType`` and ``defaultType``.

        Raises:
            errors.InvalidConfiguration: ``'aws'`` was requested but the aws
                extras are not installed.
        """
        # Copy so the signing options added below never leak into the
        # caller's configuration dict.
        client_options = dict(kwargs.get("clientOptions", {}))
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection
        # isinstance also accepts list subclasses, unlike a type() identity test.
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)

        # es6 deprecates support for multiple document types
        # using default_type for consistency
        # Will try and use multiple doc types only if explicitly specified
        self.create_multi_type = kwargs.get("createMultiType", False)
        self.default_type = kwargs.get("defaultType", "_doc")
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type if self.create_multi_type else self.default_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), (self.default_type,
                               doc_type)[self.create_multi_type]

    def stop(self):
        """Stop the auto-commit thread and commit what is left in the buffer.

        The statements run in a fixed order: join the committer, force the
        commit interval to 0, then issue one last ``commit()``.
        """
        self.auto_commiter.join()
        # NOTE(review): presumably 0 makes the final commit below refresh
        # immediately -- confirm against commit().
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Return ``doc`` with ``update_spec`` applied.

        Only specs containing "$set" or "$unset" go through the base-class
        logic; any other spec is handed back untouched.
        """
        has_operators = "$set" in update_spec or "$unset" in update_spec
        if has_operators:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a database command document to Elasticsearch.

        Handles 'dropDatabase' (deletes the mapped indexes), 'create' (puts
        a mapping) and 'drop' (bulk-deletes the documents, leaving the
        mapping in place).  Pending buffered operations are committed first.

        Raises:
            errors.OperationFailed: the command is 'renameCollection'.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            coll = (self.default_type, coll)[self.create_multi_type]
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {
                        "enabled": True
                    }})

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            coll = (self.default_type, coll)[self.create_multi_type]
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                # The fragments need explicit separators: adjacent string
                # literals are joined with nothing in between.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the document identified by ``document_id``.

        A source found in the local bulk buffer is updated here and
        re-upserted; otherwise the spec is handed to ``upsert`` together
        with a stub document.  Returns the document passed to ``upsert``.
        """
        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            buffered = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id))

        if not buffered:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
            return updated

        # Source came from the local buffer: apply the update on it so it is
        # ready for committing to Elasticsearch.
        updated = self.apply_update(buffered, update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated["_id"] = document_id
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Queue ``doc`` and its metadata entry for indexing.

        ``doc['_id']`` is removed while the actions are built and restored
        (as a string) before returning.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))

        # Source document, using lowercase namespace as index name.
        source = self._formatter.format_document(doc)
        action = dict(
            _op_type="index",
            _index=index,
            _type=doc_type,
            _id=doc_id,
            _source=source,
        )

        # Document metadata with original namespace (mixed upper/lower).
        meta_source = bson.json_util.dumps({"ns": namespace, "_ts": timestamp})
        meta_action = dict(
            _op_type="index",
            _index=self.meta_index_name,
            _type=self.meta_type,
            _id=doc_id,
            _source=meta_source,
        )

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Stream ``docs`` into Elasticsearch with the bulk helper.

        Every document produces a source action and a metadata action.
        Failed items are logged; an empty ``docs`` sequence is ignored.
        """
        def docs_to_upsert():
            """Yield the action pair for each doc; raise if there were none."""
            saw_doc = False
            for doc in docs:
                saw_doc = True
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                yield {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                yield {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {"ns": namespace, "_ts": timestamp},
                }
            if not saw_doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            bulk_options = {}
            if self.chunk_size > 0:
                bulk_options["chunk_size"] = self.chunk_size

            for ok, resp in streaming_bulk(client=self.elastic,
                                           actions=docs_to_upsert(),
                                           **bulk_options):
                if ok:
                    continue
                LOG.error("Could not bulk-upsert document "
                          "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Queue the file ``f`` as a base64 attachment plus its metadata.

        The attachment mapping is installed on the first call only.
        """
        file_doc = f.get_metadata()
        doc_id = str(file_doc.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            attachment_mapping = {
                "properties": {self.attachment_field: {"type": "attachment"}}
            }
            self.elastic.indices.put_mapping(
                index=index, doc_type=doc_type, body=attachment_mapping)
            self.has_attachment_mapping = True

        source = self._formatter.format_document(file_doc)
        encoded = base64.b64encode(f.read())
        source[self.attachment_field] = encoded.decode()

        action = dict(
            _op_type="index",
            _index=index,
            _type=doc_type,
            _id=doc_id,
            _source=source,
        )
        meta_source = bson.json_util.dumps({"ns": namespace, "_ts": timestamp})
        meta_action = dict(
            _op_type="index",
            _index=self.meta_index_name,
            _type=self.meta_type,
            _id=doc_id,
            _source=meta_source,
        )

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Buffer the deletion of a document and of its metadata entry.

        ``timestamp`` is accepted as part of the method signature but is not
        used by this method.
        """
        target_index, target_type = self._index_and_mapping(namespace)
        es_id = str(document_id)

        def delete_action(index_name, type_name):
            # Both deletions share the same id; only the location differs.
            return {
                "_op_type": "delete",
                "_index": index_name,
                "_type": type_name,
                "_id": es_id,
            }

        self.index(
            delete_action(target_index, target_type),
            delete_action(self.meta_index_name, self.meta_type),
        )

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every hit of a scan, with ``_id`` added.

        The ``body`` keyword (if given) is used as the scan query; all other
        keyword arguments are forwarded to ``scan`` unchanged. Positional
        arguments are accepted but ignored.
        """
        scan_query = kwargs.pop("body", None)
        hits = scan(self.elastic, query=scan_query, scroll="10m", **kwargs)
        for hit in hits:
            hit_id = hit["_id"]
            source = hit["_source"]
            source["_id"] = hit_id
            yield source

    def search(self, start_ts, end_ts):
        """Stream the metadata entries whose ``_ts`` lies in a closed range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB. Both bounds are inclusive.
        """
        ts_window = {"gte": start_ts, "lte": end_ts}
        range_query = {"query": {"range": {"_ts": ts_window}}}
        return self._stream_search(index=self.meta_index_name,
                                   body=range_query)

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Add an action and its meta action to the bulk buffer.

        The buffer is committed right away when it has reached the chunk size
        or when ``auto_commit_interval`` is ``0``.
        """
        with self.lock:
            self.BulkBuffer.add_upsert(
                action, meta_action, doc_source, update_spec)

        # Every document contributes two buffered entries (action + meta
        # action), hence the division by two.
        buffered_documents = len(self.BulkBuffer.action_buffer) / 2
        buffer_is_full = buffered_documents >= self.chunk_size
        if buffer_is_full or self.auto_commit_interval == 0:
            self.commit()

    def send_buffered_operations(self):
        """Flush the buffered actions to Elasticsearch in one bulk request.

        This method is periodically called by the AutoCommitThread. Failures
        raised by the Elasticsearch client are logged, not propagated.
        """
        with self.lock:
            try:
                pending_actions = self.BulkBuffer.get_buffer()
                if not pending_actions:
                    return
                sent_count, failures = bulk(self.elastic, pending_actions)
                LOG.debug(
                    "Bulk request finished, successfully sent %d operations",
                    sent_count,
                )
                if failures:
                    LOG.error("Bulk request finished with errors: %r",
                              failures)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Flush the buffered operations, then refresh the indexes.

        The refresh is issued through ``retry_until_ok`` with an empty index
        name.
        """
        self.send_buffered_operations()
        refresh_indexes = self.elastic.indices.refresh
        retry_until_ok(refresh_indexes, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Return the metadata entry with the highest ``_ts``, or ``None``.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback. The returned dict is the
        hit's ``_source`` with the hit's ``_id`` added to it.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(
                index=self.meta_index_name,
                body=newest_first,
                size=1,
            )
            for hit in response["hits"]["hits"]:
                hit_id = hit["_id"]
                latest = hit["_source"]
                latest["_id"] = hit_id
                return latest
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        return None

Example #11

Show file

class DocManager(DocManagerBase):
    """DLKit -> Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch. Massages the DLKit data to include repository ID and run
    info for assets.
    """

    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Create the Elasticsearch client and start auto-commit if enabled.

        Args:
            url: Elasticsearch host; passed to the client as its only host.
            auto_commit_interval: delay between automatic index refreshes.
                ``0`` refreshes on every write; ``None`` or ``0`` means no
                refresh timer is started.
            unique_key: stored on the instance as ``unique_key``.
            chunk_size: bulk chunk size; only forwarded when positive.
            meta_index_name: index that stores the per-document metadata.
            meta_type: mapping type of the metadata entries.
            attachment_field: field that receives base64 file content.
            **kwargs: ``clientOptions`` (a dict) is forwarded to the client.
        """
        self.elastic = Elasticsearch(hosts=[url],
                                     **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace.

        The namespace is split on its first ``.``; the index part is
        lowercased, the type part is returned unchanged.
        """
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        # run_auto_commit() only re-arms its timer while this is set.
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Return the document resulting from applying ``update_spec``.

        A spec without ``$set``/``$unset`` is treated as a full replacement
        and returned as is; otherwise the base class applies the operators.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a MongoDB database command to Elasticsearch.

        Handles ``dropDatabase``, ``create`` and ``drop``.

        Raises:
            errors.OperationFailed: for ``renameCollection``, which is not
                supported.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {
                        "enabled": True
                    }})

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        When the spec describes an ``edx-composition`` carrying ``assetIds``,
        every referenced asset document is additionally re-indexed with the
        run, course and domain repository ids and a ``fullText`` field, and
        assets dropped from the composition lose those repository ids again.

        Returns:
            The updated document (``_id`` plus the fields of its source).
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index,
                                    doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)

        # If the update is to a repository.Composition, make sure the
        # assets listed in assetIds also have a reference to the
        # repositoryId / parent of the given composition.
        # Operator-style specs ({"$set": ...}) have no top-level genusTypeId;
        # treat them as "not a composition" instead of raising KeyError.
        genus_type = update_spec.get('genusTypeId') or ''
        if 'edx-composition' in genus_type and 'assetIds' in update_spec:
            # get repositoryId and its parent
            app_user = User(username='******', authenticated=True)
            dummy_request = create_test_request(app_user)
            activate_managers(dummy_request)
            rm = get_session_data(dummy_request, 'rm')
            run_repo_id = update_spec['repositoryId']
            run_repo = rm.get_repository(clean_id(run_repo_id))
            # NOTE(review): .next() is the Python 2 iterator spelling;
            # confirm the returned objects still expose it on Python 3.
            course_repo = rm.get_parent_repositories(run_repo.ident).next()
            domain_repo = rm.get_parent_repositories(course_repo.ident).next()

            # now get the assets referenced in update_spec['assetIds'] and
            # append the course_run_name to their docs
            for asset_id in update_spec['assetIds']:
                asset_doc_id = ObjectId(clean_id(asset_id).identifier)
                asset_namespace = '{0}.{1}'.format(index, 'Asset')
                asset_document = self.elastic.get(index=index,
                                                  doc_type='Asset',
                                                  id=u(asset_doc_id))

                if 'enclosedObjectId' in asset_document['_source']:
                    am = get_session_data(dummy_request, 'am')
                    bank = am.get_bank(
                        clean_id(asset_document['_source']['repositoryId']))
                    items = bank.get_assessment_items(
                        clean_id(
                            asset_document['_source']['enclosedObjectId']))
                    try:
                        item_text = ' '.join(
                            [i.get_text('edxml') for i in items])
                    except AttributeError:
                        item_text = ''
                else:
                    item_text = ' '.join([
                        ac['text']['text']
                        for ac in asset_document['_source']['assetContents']
                    ])
                full_text = '{0} {1} {2}'.format(
                    asset_document['_source']['displayName']['text'],
                    asset_document['_source']['description']['text'],
                    item_text)

                denormalized_asset = asset_document['_source'].copy()
                add_metadata(denormalized_asset, 'runs', str(run_repo.ident))
                add_metadata(denormalized_asset, 'courses',
                             str(course_repo.ident))
                add_metadata(denormalized_asset, 'domains',
                             str(domain_repo.ident))
                add_metadata(denormalized_asset, 'fullText', full_text)

                updated_asset = self.apply_update(asset_document['_source'],
                                                  denormalized_asset)
                # _id is immutable in MongoDB, so won't have changed in update
                updated_asset['_id'] = asset_document['_id']
                self.upsert(updated_asset, asset_namespace, timestamp)

            # also remove any runs, if asset removed from a composition
            if 'assetIds' in document['_source']:
                removed_assets = [
                    i for i in document['_source']['assetIds']
                    if i not in update_spec['assetIds']
                ]
                for asset_id in removed_assets:
                    asset_doc_id = ObjectId(clean_id(asset_id).identifier)
                    asset_namespace = '{0}.{1}'.format(index, 'Asset')
                    asset_document = self.elastic.get(index=index,
                                                      doc_type='Asset',
                                                      id=u(asset_doc_id))

                    denormalized_asset = asset_document['_source'].copy()
                    denormalized_asset['runs'].remove(str(run_repo.ident))
                    denormalized_asset['courses'].remove(str(
                        course_repo.ident))
                    # The ids are stored as strings (see add_metadata calls
                    # above), so the domain id must be stringified as well.
                    denormalized_asset['domains'].remove(
                        str(domain_repo.ident))

                    updated_asset = self.apply_update(
                        asset_document['_source'], denormalized_asset)
                    # _id is immutable in MongoDB, so won't have changed in update
                    updated_asset['_id'] = asset_document['_id']
                    self.upsert(updated_asset, asset_namespace, timestamp)

        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch.

        The document is indexed under its ``_id``; a second entry holding the
        namespace and timestamp is written to the meta index. ``doc`` keeps
        its ``_id`` key on return (as the converted id value).
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index,
                           doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Documents are streamed through ``streaming_bulk``; each one produces
        an action for the document and one for its metadata entry. Failed
        items are logged. An empty ``docs`` iterable is silently accepted.
        """
        def docs_to_upsert():
            doc = None
            index, doc_type = self._index_and_mapping(namespace)
            for doc in docs:
                # Remove metadata and redundant _id
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        # Full original namespace, as stored by upsert().
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # ``doc`` stays None only when the loop never ran; a document
            # that became empty after popping _id is not an empty sequence.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file as a base64 attachment together with its metadata.

        ``f`` must provide ``get_metadata()`` (a dict containing ``_id``) and
        ``read()``.
        """
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {
                        "type": "attachment"
                    }
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        self.elastic.index(index=index,
                           doc_type=doc_type,
                           body=doc,
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document and its metadata entry from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        self.elastic.delete(index=index,
                            doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        The ``body`` keyword is used as the scan query; each yielded item is
        a hit's ``_source`` with the hit's ``_id`` added.
        """
        for hit in scan(self.elastic,
                        query=kwargs.pop('body', None),
                        scroll='10m',
                        **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB. Both bounds are inclusive.
        """
        return self._stream_search(index=self.meta_index_name,
                                   body={
                                       "query": {
                                           "filtered": {
                                               "filter": {
                                                   "range": {
                                                       "_ts": {
                                                           "gte": start_ts,
                                                           "lte": end_ts
                                                       }
                                                   }
                                               }
                                           }
                                       }
                                   })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Re-arm the timer only while auto-commit is still enabled; stop()
        # clears the interval, which ends the chain.
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns:
            The metadata entry with the highest ``_ts`` (with ``_id`` added),
            or ``None`` when there is none.
        """
        try:
            result = self.elastic.search(index=self.meta_index_name,
                                         body={
                                             "query": {
                                                 "match_all": {}
                                             },
                                             "sort": [{
                                                 "_ts": "desc"
                                             }],
                                         },
                                         size=1)["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        return None

Example #12

Show file

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client and start auto-commit if enabled.

        Args:
            url: a single Elasticsearch host or a list of hosts.
            auto_commit_interval: ``None`` or ``0`` means
                ``run_auto_commit()`` is not started here.
            unique_key: stored on the instance as ``unique_key``.
            chunk_size: bulk chunk size; only forwarded when positive.
            meta_index_name: index that stores the per-document metadata.
            meta_type: mapping type of the metadata entries.
            attachment_field: field that receives base64 file content.
            **kwargs: recognised keys are ``clientOptions`` (dict forwarded
                to the client), ``aws`` (enables request signing) and
                ``routing`` (parent-field configuration).

        Raises:
            errors.InvalidConfiguration: ``aws`` was given but the AWS extras
                are not installed.
        """
        # Work on a copy so the defaults and AWS settings added below never
        # leak into the dict owned by the caller.
        client_options = dict(kwargs.get('clientOptions', {}))
        client_options.setdefault('sniff_on_start', True)
        client_options.setdefault('sniff_on_connection_fail', True)
        client_options.setdefault('sniffer_timeout', 60)
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        # A lone host is wrapped; list subclasses are accepted as they are.
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def _get_parent_field(self, index, doc_type):
        """Get the parent field name for this collection."""
        try:
            return self.routing[index][doc_type]['variant_id']
        except KeyError:
            return None

    def _is_child_type(self, index, doc_type):
        """Return True if this mapping type is a child"""
        return self._get_parent_field(index, doc_type) is not None

    def _get_parent_id_from_mongodb(self, index, doc_type, doc):
        """Get parent ID from doc"""
        parent_field = self._get_parent_field(index, doc_type)
        if parent_field is None:
            return None

        return self._formatter.transform_value(doc.pop(parent_field, None))

    def _get_parent_id_from_elastic(self, doc):
        """Get parent ID from doc"""
        return doc.get('_parent')

    def _search_doc_by_id(self, index, doc_type, doc_id):
        """Find a document by ``_id`` with an ``ids`` query.

        Returns the hit when the reported total is exactly one, otherwise
        ``None``.
        """
        ids_query = {
            'query': {
                'ids': {
                    'type': doc_type,
                    'values': [u(doc_id)],
                },
            },
        }
        response = self.elastic.search(index=index, doc_type=doc_type,
                                       body=ids_query)
        hits = response['hits']
        if hits['total'] != 1:
            return None
        return hits['hits'][0]

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a MongoDB database command to Elasticsearch.

        Handles ``dropDatabase`` (deletes the mapped indexes), ``create``
        (puts a mapping for the new type) and ``drop`` (deletes every
        document of the type; the mapping itself is left in place).

        Raises:
            errors.OperationFailed: for ``renameCollection``, which is not
                supported.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                # The literals are concatenated, so each one carries its own
                # trailing space to keep the sentences separated.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Three cases are handled:

        * ``facility_variant`` namespaces: the variant whose
          ``facility_variant_id`` matches ``document_id`` is looked up in the
          ``catalog`` index, patched and re-indexed as a direct update.
          Nothing happens unless exactly one such variant is found.
        * specs that contain ``_id`` (full replacement documents) are
          upserted as they are.
        * any other spec: the document is fetched through ``m_variant``,
          patched with the ``$set`` values (``reason`` is stored as
          ``variant_reason``) and upserted.
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)

        if doc_type == "facility_variant":
            if not document_id:
                return
            result = self.elastic.search(
                index="catalog", doc_type="variant",
                body={
                    "query": {
                        "match": {"facility_variant_id": u(document_id)}
                    }
                })
            if result['hits']['total'] != 1:
                return
            document = result['hits']['hits'][0]
            variant_source = document.get('_source')
            if not variant_source:
                return

            if "$set" in update_spec:
                # Only fields the indexed variant already has are patched.
                changes = update_spec['$set']
                for field in changes:
                    if field in variant_source:
                        variant_source[str(field)] = changes[field]
            else:
                # Replacement-style spec: copy the facility-level fields.
                for field in ('status', 'comment', 'reason', 'is_available',
                              'mrp', 'selling_price', 'discount'):
                    variant_source[field] = update_spec[field]
            variant_source['_id'] = document['_id']
            # Marker consumed by upsert() to index the document directly.
            variant_source['is_direct_update'] = True
            self.upsert(variant_source, namespace, timestamp)
            return

        if "_id" in update_spec:
            self.upsert(update_spec, namespace, timestamp)
            return

        variant_doc = m_variant.find_one({"_id": document_id})
        if not variant_doc:
            # Nothing to re-index; passing None on to upsert() would fail
            # there with a TypeError.
            LOG.warning("Cannot update %s: document %r was not found",
                        namespace, document_id)
            return
        if "$set" in update_spec:
            changes = update_spec['$set']
            for field in changes:
                if str(field) == "reason":
                    variant_doc['variant_reason'] = changes[field]
                else:
                    variant_doc[str(field)] = changes[field]

            variant_doc['variant_id'] = str(document_id)
        self.upsert(variant_doc, namespace, timestamp)
        

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch.

        Nothing is written unless the type part of ``namespace`` is
        ``variant`` or ``doc`` carries the ``is_direct_update`` marker.
        Documents are always indexed into the ``catalog.variant`` namespace,
        whatever ``namespace`` was passed in. A marked document is indexed as
        is (the marker is removed); any other document is first expanded by
        ``elastic_doc``. Each indexed document gets a companion entry in the
        meta index. ``_id`` is popped from every indexed document.
        """
        LOG.info("calling : upsert with : %s,%s", doc, namespace)

        def docs_to_upsert():
            # Target location is fixed, independent of the source namespace.
            target_namespace = 'catalog.variant'
            index, doc_type = self._index_and_mapping(target_namespace)
            if 'is_direct_update' in doc:
                doc.pop("is_direct_update")
                elastic_docs = [doc]
                LOG.info("final object %s, %s,%s",
                         elastic_docs, index, doc_type)
            else:
                elastic_docs = elastic_doc(doc)
            # Checked before iterating: an empty (or missing) result is the
            # only way there is nothing to send.
            if not elastic_docs:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
            for elastic_document in elastic_docs:
                # Remove metadata and redundant _id
                doc_id = u(elastic_document.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(
                        elastic_document)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": target_namespace,
                        "_ts": timestamp
                    }
                }

                yield document_action
                yield document_meta

        index, doc_type = self._index_and_mapping(namespace)
        # The marker is tested here, before the lazy generator above gets a
        # chance to pop it.
        if doc_type != "variant" and 'is_direct_update' not in doc:
            return

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass


    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Bulk-index ``docs`` into Elasticsearch.

        Each source document is expanded through ``elastic_doc``; every
        resulting document produces one source action and one metadata
        action.  The request is only made when ``namespace`` maps to the
        ``variant`` type.
        """
        LOG.info("calling : bulk_upsert")

        def generate_actions():
            # Remembers the most recent object handled so that an empty
            # ``docs`` sequence can be detected after the loop.
            last_seen = None
            for source_doc in docs:
                last_seen = source_doc
                for expanded in elastic_doc(source_doc):
                    last_seen = expanded
                    target_index, target_type = self._index_and_mapping(
                        namespace)
                    # The _id goes in the action header, not in _source.
                    es_id = u(expanded.pop("_id"))
                    source_action = {
                        "_index": target_index,
                        "_type": target_type,
                        "_id": es_id,
                        "_source": self._formatter.format_document(expanded)
                    }
                    meta_action = {
                        "_index": self.meta_index_name,
                        "_type": self.meta_type,
                        "_id": es_id,
                        "_source": {"ns": namespace, "_ts": timestamp}
                    }
                    yield source_action
                    yield meta_action
            if last_seen is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        _, mapped_type = self._index_and_mapping(namespace)
        if mapped_type != "variant":
            return

        try:
            bulk_options = {}
            if self.chunk_size > 0:
                bulk_options['chunk_size'] = self.chunk_size

            for succeeded, response in streaming_bulk(
                    client=self.elastic,
                    actions=generate_actions(),
                    **bulk_options):
                if succeeded:
                    continue
                LOG.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % response)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # Raised by the generator when ``docs`` was empty, e.g. when
            # mongo-connector starts up with nothing to dump.
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index the metadata and base64-encoded content of file ``f``.

        The attachment mapping is installed the first time this is called,
        and a metadata entry (``ns`` and ``_ts``) is written to the meta
        index under the same id.
        """
        file_doc = f.get_metadata()
        es_id = str(file_doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # Elasticsearch must treat the attachment field as a file.
        if not self.has_attachment_mapping:
            attachment_mapping = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=attachment_mapping)
            self.has_attachment_mapping = True

        # Must run before formatting: this also removes the parent id field.
        parent_id = self._get_parent_id_from_mongodb(index, doc_type,
                                                     file_doc)

        source = self._formatter.format_document(file_doc)
        source[self.attachment_field] = base64.b64encode(f.read()).decode()

        routing = {} if parent_id is None else {'parent': parent_id}
        refresh_now = self.auto_commit_interval == 0

        self.elastic.index(
            index=index, doc_type=doc_type, body=source, id=es_id,
            refresh=refresh_now, **routing)

        meta_body = bson.json_util.dumps({
            'ns': namespace,
            '_ts': timestamp,
        })
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=meta_body, id=es_id,
                           refresh=refresh_now)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Delete a document and its metadata entry from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        routing = {}
        if self._is_child_type(index, doc_type):
            # A child document cannot be deleted directly: Elasticsearch
            # needs the parent ID to route the request, and the remove
            # request does not carry it, so look the document up first.
            existing = self._search_doc_by_id(index, doc_type, document_id)
            if existing is None:
                LOG.error('Could not find document with ID "%s" in '
                          'Elasticsearch to apply remove', u(document_id))
                return

            parent = self._get_parent_id_from_elastic(existing)
            if parent is not None:
                routing['parent'] = parent

        # First the document itself (with routing), then its metadata entry.
        targets = (
            (index, doc_type, routing),
            (self.meta_index_name, self.meta_type, {}),
        )
        for target_index, target_type, extra in targets:
            self.elastic.delete(index=target_index, doc_type=target_type,
                                id=u(document_id),
                                refresh=(self.auto_commit_interval == 0),
                                **extra)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every hit of a scrolled search.

        The ``body`` keyword is passed to ``scan`` as the query; all other
        keywords are forwarded unchanged.  Each yielded source carries the
        hit's ``_id`` and, when present, its ``_parent``.
        """
        query_body = kwargs.pop('body', None)
        hits = scan(self.elastic, query=query_body, scroll='10m', **kwargs)
        for hit in hits:
            source = hit['_source']
            source['_id'] = hit['_id']
            if '_parent' in hit:
                source['_parent'] = hit['_parent']
            yield source

    def search(self, start_ts, end_ts):
        """Return metadata entries whose ``_ts`` lies in a time range.

        Both bounds are inclusive.  Used to find documents that may be in
        conflict during a rollback event in MongoDB.
        """
        ts_bounds = {"gte": start_ts, "lte": end_ts}
        range_query = {"query": {"range": {"_ts": ts_bounds}}}
        return self._stream_search(index=self.meta_index_name,
                                   body=range_query)

    def commit(self):
        """Refresh the Elasticsearch indexes, retrying until it succeeds."""
        refresh_indexes = self.elastic.indices.refresh
        retry_until_ok(refresh_indexes, index="")

    def run_auto_commit(self):
        """Refresh the indexes now and schedule the next refresh.

        No further refresh is scheduled once ``auto_commit_interval`` is
        ``None`` or ``0``.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval in (None, 0):
            return
        Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the metadata entry with the highest ``_ts``, if any.

        Used to define the time window within which documents may be in
        conflict after a MongoDB rollback.  Returns ``None`` when the meta
        index has no hits or Elasticsearch rejects the request.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(index=self.meta_index_name,
                                           body=newest_first,
                                           size=1)
            newest = next(iter(response["hits"]["hits"]), None)
            if newest is None:
                return None
            newest['_source']['_id'] = newest['_id']
            return newest['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #13

Show file

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.  Single-document operations are queued in a BulkBuffer and
    sent by ``send_buffered_operations``; bulk upserts are streamed directly.
    """

    # Source MongoDB used by search_doc() to re-fetch a document that is
    # missing from Elasticsearch when an update for it arrives.
    MONGO_HOST = "playdb01.prod.hcm.fplay"
    MONGO_PORT = 27017
    MONGO_MAX_POOL_SIZE = 50
    MONGO_DATABASE = "fteluv"
    MONGO_COLLECTION = "videos_ver2"

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Create the Elasticsearch client and start the auto-commit thread.

        ``url`` may be a single host or a list of hosts.  Recognised keyword
        options are ``clientOptions`` (passed to the Elasticsearch client),
        ``aws`` (request signing) and ``autoSendInterval``.
        """
        # Copy so the AWS settings below never leak into the caller's dict.
        client_options = dict(kwargs.get('clientOptions', {}))
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        # config timeout (can be supplied through clientOptions)
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Created lazily by search_doc() and reused afterwards.
        self._mongo_client = None

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while commiting documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get('autoSendInterval',
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commiter.join()
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a MongoDB database command to Elasticsearch.

        Handles ``dropDatabase``, ``create`` and ``drop``;
        ``renameCollection`` raises ``errors.OperationFailed``.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must "
                              "be removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def search_exist(self, index, document_id):
        """Return True when ``index`` holds a document with ``document_id``."""
        # Use the configured client: a default Elasticsearch() would talk to
        # localhost and ignore the hosts/auth given to __init__.
        res = self.elastic.search(
            index=index,
            body={"query": {"match": {"_id": document_id}}})
        total = res['hits']['total']
        if isinstance(total, dict):
            # Newer Elasticsearch versions report {"value": n, ...}.
            total = total.get('value', 0)
        return total >= 1

    @wrap_exceptions
    def search_doc(self, document_id):
        """Return a cursor over the source MongoDB document ``document_id``.

        The MongoDB client is created on first use and then reused, instead
        of opening a new connection pool on every call.
        """
        if getattr(self, '_mongo_client', None) is None:
            self._mongo_client = MongoClient(
                self.MONGO_HOST, self.MONGO_PORT,
                maxPoolSize=self.MONGO_MAX_POOL_SIZE)
        database = self._mongo_client[self.MONGO_DATABASE]
        collection = database[self.MONGO_COLLECTION]
        return collection.find({"_id": ObjectId(document_id)})

    @wrap_exceptions
    def key_value_update(self, update_spec):
        """Log each key of ``update_spec`` and each item of its values.

        Returns a ``(key, item)`` pair holding the last key and the last
        item visited; either is ``None`` if nothing was visited.
        """
        key = item = None
        for key, value in update_spec.items():
            LOG.debug("%s", key)
            for item in value:
                LOG.debug("%s", item)
        return key, item

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        If the document is not in Elasticsearch yet it is first re-fetched
        from MongoDB and indexed.  Returns the updated document, or just its
        ``_id`` when the update is applied later from ``update_spec``.
        """
        LOG.debug("document_id: %s", document_id)
        LOG.debug("update_spec: %s", update_spec)
        document_id = str(document_id)

        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(index,
                                                        doc_type,
                                                        u(document_id))
        if not self.search_exist(index, document_id):
            LOG.debug(" Not Found")
            docs = self.search_doc(document_id)
            LOG.debug(" docs :%s ", docs)
            self.bulk_upsert_update(docs, namespace, timestamp)
        if document:
            # Document source collected from local buffer
            # Perform apply_update on it and then it will be
            # ready for commiting to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated['_id'] = document_id
            self.upsert(updated, namespace, timestamp)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)

        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            'ns': namespace,
            '_ts': timestamp
        }

        # Index the source document, using lowercase namespace as index name.
        action = {
            '_op_type': 'index',
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            '_source': self._formatter.format_document(doc)
        }
        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            '_op_type': 'index',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            '_source': bson.json_util.dumps(metadata)
        }

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    def _bulk_index_actions(self, docs, namespace, timestamp):
        """Yield a source action and a metadata action for each document."""
        index, doc_type = self._index_and_mapping(namespace)
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            doc_id = u(doc.pop("_id"))
            # Documents with status 0 are not indexed.  A document without a
            # status field is indexed rather than aborting the whole batch.
            if doc.get("status") == 0:
                LOG.debug("to be continue!!")
                continue
            document_action = {
                '_index': index,
                '_type': doc_type,
                '_id': doc_id,
                '_source': self._formatter.format_document(doc)
            }
            document_meta = {
                '_index': self.meta_index_name,
                '_type': self.meta_type,
                '_id': doc_id,
                '_source': {
                    'ns': namespace,
                    '_ts': timestamp
                }
            }
            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")

    def _stream_bulk_upsert(self, docs, namespace, timestamp):
        """Stream ``docs`` to Elasticsearch, logging every failed action."""
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(
                client=self.elastic,
                actions=self._bulk_index_actions(docs, namespace, timestamp),
                **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        self._stream_bulk_upsert(docs, namespace, timestamp)
        LOG.debug("Done Bulk_upsert")

    @wrap_exceptions
    def bulk_upsert_update(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Called by ``update`` to index a document that was missing from
        Elasticsearch; behaves exactly like ``bulk_upsert``.
        """
        self._stream_bulk_upsert(docs, namespace, timestamp)
        LOG.debug("Done Bulk_upsert_update")

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Queue the metadata and base64-encoded content of file ``f``."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        action = {
            '_op_type': 'index',
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            '_source': doc
        }
        meta_action = {
            '_op_type': 'index',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            '_source': bson.json_util.dumps(metadata)
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            '_op_type': 'delete',
            '_index': index,
            '_type': doc_type,
            '_id': u(document_id)
        }

        meta_action = {
            '_op_type': 'delete',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': u(document_id)
        }

        self.index(action, meta_action)
        LOG.debug("remove")

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results."""
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {
                        "_ts": {"gte": start_ts, "lte": end_ts}
                    }
                }
            })

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Queue an action with its metadata action, committing when due.

        A commit is triggered once ``chunk_size`` documents are buffered or
        when ``auto_commit_interval`` is 0.
        """
        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action,
                                       doc_source, update_spec)

        # Divide by two to account for meta actions
        if (len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size
                or self.auto_commit_interval == 0):
            self.commit()

    def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    # Named ``failures`` so the ``errors`` module is not
                    # shadowed inside this method.
                    successes, failures = bulk(self.elastic, action_buffer)
                    LOG.debug("Bulk request finished, successfully sent %d "
                              "operations", successes)
                    if failures:
                        LOG.error(
                            "Bulk request finished with errors: %r", failures)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Send buffered requests and refresh all indexes."""
        self.send_buffered_operations()
        retry_until_ok(self.elastic.indices.refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #14

Show file

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 **kwargs):
        """Connect to Elasticsearch at ``url`` and store the settings.

        Auto-commit is started right away unless ``auto_commit_interval`` is
        ``None`` or ``0``.  Extra keyword arguments are accepted and ignored.
        """
        self._formatter = DefaultDocumentFormatter()
        # default type is string, change if needed
        self.doc_type = 'string'
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.auto_commit_interval = auto_commit_interval
        self.elastic = Elasticsearch(hosts=[url])

        auto_commit_enabled = self.auto_commit_interval not in [None, 0]
        if auto_commit_enabled:
            self.run_auto_commit()

    def stop(self):
        """Disable auto-commit by clearing ``auto_commit_interval``."""
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Return ``doc`` with ``update_spec`` applied.

        Only specs that contain ``$set`` or ``$unset`` are delegated to the
        base class; any other spec is returned unchanged.
        """
        if "$set" in update_spec or "$unset" in update_spec:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply ``update_spec`` to the stored document matching ``doc``.

        ``doc`` supplies the ``ns``, ``_id`` and ``_ts`` of the target.  The
        stored source is fetched, updated and indexed again; the updated
        document is returned.
        """
        self.commit()
        stored = self.elastic.get(index=doc['ns'], id=str(doc['_id']))
        updated = self.apply_update(stored['_source'], update_spec)
        # _id is immutable in MongoDB, so it is taken from the stored
        # document.  ns and _ts are metadata that upsert() expects inside
        # the document until they become separate arguments in 2.x.
        updated.update({
            '_id': stored['_id'],
            'ns': doc['ns'],
            '_ts': doc['_ts'],
        })
        self.upsert(updated)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Index ``doc`` and its metadata entry in Elasticsearch.

        ``ns`` and ``_ts`` are removed from ``doc``; ``_id`` is put back as
        a string once both index requests have been made.
        """
        # Strip everything that must not end up in _source.
        index = doc.pop('ns')
        doc_id = str(doc.pop("_id"))
        timestamp = doc.pop("_ts")
        refresh_now = self.auto_commit_interval == 0

        # Index the source document
        self.elastic.index(index=index,
                           doc_type=self.doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=refresh_now)
        # Index document metadata
        meta_body = bson.json_util.dumps({"ns": index, "_ts": timestamp})
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=meta_body,
                           id=doc_id,
                           refresh=refresh_now)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch.

        Every document in ``docs`` must carry ``ns``, ``_id`` and ``_ts``;
        these are removed from the document and used for the action header
        and the metadata entry.  An empty ``docs`` sequence is ignored.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = str(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                document_action = {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # Compare against the sentinel, not truthiness: a last document
            # that only held ns/_id/_ts is an empty dict after the pops and
            # must not be mistaken for "no documents".
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Delete a document and its metadata record from Elasticsearch.

        ``doc`` must carry ``ns`` (used as the index name of the source
        document) and ``_id`` (stringified to form the Elasticsearch id).
        """
        # (index, type) of the source document first, then of its metadata.
        targets = (
            (doc['ns'], self.doc_type),
            (self.meta_index_name, self.meta_type),
        )
        doc_id = str(doc["_id"])
        for index_name, type_name in targets:
            self.elastic.delete(index=index_name,
                                doc_type=type_name,
                                id=doc_id,
                                refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every hit produced by a scan.

        The ``body`` keyword, when present, is handed to ``scan`` as the
        query; all remaining keyword arguments are forwarded unchanged.
        Positional arguments are accepted but not used. Each yielded source
        dict gets the hit's ``_id`` copied into it.
        """
        query = kwargs.pop('body', None)
        hits = scan(self.elastic, query=query, scroll='10m', **kwargs)
        for hit in hits:
            hit_id = hit['_id']
            source = hit['_source']
            source['_id'] = hit_id
            yield source

    def search(self, start_ts, end_ts):
        """Stream metadata entries whose ``_ts`` is within a time range.

        Both bounds are inclusive (``gte`` / ``lte``). The query runs against
        the metadata index.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        ts_bounds = {"gte": start_ts, "lte": end_ts}
        range_filter = {"range": {"_ts": ts_bounds}}
        query_body = {"query": {"filtered": {"filter": range_filter}}}
        return self._stream_search(index=self.meta_index_name,
                                   body=query_body)

    def commit(self):
        """Force an index refresh, retried through ``retry_until_ok``.

        The refresh is requested with an empty index name.
        """
        refresh = self.elastic.indices.refresh
        retry_until_ok(refresh, index="")

    def run_auto_commit(self):
        """Refresh the indexes, then schedule the next periodic refresh.

        When ``auto_commit_interval`` is neither ``None`` nor ``0`` a
        ``Timer`` is started that calls this method again after that many
        seconds; otherwise no further refresh is scheduled.
        """
        self.elastic.indices.refresh()
        # Read the interval exactly once. If the attribute were reset to None
        # by another thread between the check and the Timer construction, the
        # Timer would be created with a None interval and never fire.
        interval = self.auto_commit_interval
        if interval not in [None, 0]:
            Timer(interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the metadata entry with the highest ``_ts``, or None.

        The metadata index is searched for a single hit sorted by ``_ts``
        descending; the hit's ``_id`` is copied into the returned source
        dict. None is returned when there are no hits or when Elasticsearch
        rejects the request.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(index=self.meta_index_name,
                                           body=newest_first,
                                           size=1)
            hits = response["hits"]["hits"]
            if hits:
                newest = hits[0]
                newest['_source']['_id'] = newest['_id']
                return newest['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        return None

Example #15

Show file

File: elastic_doc_manager.py Project: AdamsLee/mongo-connector

class DocManager(DocManagerBase):
    """Elasticsearch-backed DocManager.

    Creates a connection to the backend engine and adds/removes documents,
    and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.

    We are using elastic native fields for _id and ns, but we also store
    them as fields in the document, due to compatibility issues.
    """

    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 **kwargs):
        """Establish a connection to Elastic.

        ``url`` is used as the single Elasticsearch host. When
        ``auto_commit_interval`` is neither None nor 0 the periodic refresh
        is started right away. Extra keyword arguments are accepted but not
        used here.
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the instance.

        Resets ``auto_commit_interval`` to None, which keeps
        ``run_auto_commit`` from scheduling another timer.
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        The current document is fetched from the index named by
        ``doc['ns']``, updated through ``apply_update``, re-indexed with
        ``upsert`` and returned.
        """
        document = self.elastic.get(index=doc['ns'], id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated)
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Elastic.

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        ``doc['_id']`` is removed while the document is indexed and put back
        afterwards as a string.
        """
        doc_type = self.doc_type
        index = doc['ns']
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        try:
            self.elastic.index(index=index,
                               doc_type=doc_type,
                               body=self._formatter.format_document(doc),
                               id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
        finally:
            # Restore _id even when indexing fails, so the caller's dict is
            # never left without its id.
            doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic.

        docs may be any iterable. ``_id`` is popped from every document (the
        input dicts are mutated). Items rejected by Elasticsearch are logged,
        not raised, and an empty iterable is silently ignored.
        """
        def docs_to_upsert():
            # Sentinel: still None after the loop only if docs was empty.
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc_id = str(doc.pop("_id"))
                yield {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
            # Identity check rather than truthiness, so a falsy document can
            # never be mistaken for an empty input sequence.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Remove a document from Elastic.

        The input is a python dictionary that represents a mongo document;
        its ``ns`` names the index and its ``_id`` the document to delete.
        """
        self.elastic.delete(index=doc['ns'],
                            doc_type=self.doc_type,
                            id=str(doc["_id"]),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        The ``body`` keyword is passed to ``scan`` as the query; positional
        arguments are accepted but not used. Each yielded ``_source`` dict
        has the hit's ``_id`` copied into it.
        """
        for hit in scan(self.elastic,
                        query=kwargs.pop('body', None),
                        scroll='10m',
                        **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elastic for documents whose ``_ts`` is in a time range.

        Both bounds are inclusive and every index (``_all``) is searched.
        """
        return self._stream_search(index="_all",
                                   body={
                                       "query": {
                                           "filtered": {
                                               "filter": {
                                                   "range": {
                                                       "_ts": {
                                                           "gte": start_ts,
                                                           "lte": end_ts
                                                       }
                                                   }
                                               }
                                           }
                                       }
                                   })

    def commit(self):
        """Force a refresh/commit, retried through ``retry_until_ok``."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Refresh the indexes, then schedule the next periodic refresh."""
        self.elastic.indices.refresh()
        # Read the interval exactly once: stop() may set it to None from
        # another thread, and a Timer built with None would never fire.
        interval = self.auto_commit_interval
        if interval not in [None, 0]:
            Timer(interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the document with the highest ``_ts`` across all indexes.

        Returns None when there are no hits or when Elasticsearch rejects
        the request.
        """
        try:
            result = self.elastic.search(index="_all",
                                         body={
                                             "query": {
                                                 "match_all": {}
                                             },
                                             "sort": [{
                                                 "_ts": "desc"
                                             }],
                                         },
                                         size=1)["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #16

Show file

File: elastic_doc_manager.py Project: quintstoffers/elastic-doc-manager

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch and store the manager's settings.

        ``url`` may be a list of hosts or a comma-separated string. The
        ``clientOptions`` entry of ``kwargs``, if any, is forwarded to the
        Elasticsearch client. When ``auto_commit_interval`` is neither None
        nor 0 the periodic refresh is started right away.
        """
        hosts = self._get_hosts(url)
        self.elastic = Elasticsearch(
            hosts=hosts, timeout=60, **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _get_hosts(self, url):
        """Return ``url`` as a list of hosts.

        A list is returned unchanged and a string is split on commas; any
        other type raises ``errors.ConnectionFailed``.
        """
        if isinstance(url, list):
            return url
        elif isinstance(url, str):
            return url.split(',')
        else:
            raise errors.ConnectionFailed("Invalid URI for Elastic")

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace.

        The namespace is split on its first '.'; the index part is
        lower-cased, the type part is returned as is.
        """
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread.

        Resets ``auto_commit_interval`` to None, which keeps
        ``run_auto_commit`` from scheduling another timer.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec with neither ``$set`` nor ``$unset`` is returned unchanged;
        anything else is delegated to the base class.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """React to a MongoDB command document.

        ``dropDatabase`` deletes the index of every mapped database,
        ``create`` puts a mapping for the new collection, ``drop`` deletes
        the collection's mapping, and ``renameCollection`` raises
        ``errors.OperationFailed`` because it is not supported.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        The indexes are refreshed first, then the current document is
        fetched, updated through ``apply_update``, re-indexed with
        ``upsert`` and returned.
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index, doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch.

        The formatted document is indexed under the index/type derived from
        ``namespace`` and a ``{"ns", "_ts"}`` record is indexed under the
        same id in the metadata index. ``doc['_id']`` is removed while
        indexing and put back afterwards in its ``u()``-converted form.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }
        try:
            # Index the source document, using lowercase namespace as index
            # name.
            self.elastic.index(index=index, doc_type=doc_type,
                               body=self._formatter.format_document(doc),
                               id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
            # Index document metadata with original namespace (mixed
            # upper/lower).
            self.elastic.index(index=self.meta_index_name,
                               doc_type=self.meta_type,
                               body=bson.json_util.dumps(metadata), id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
        finally:
            # Leave _id, since it's part of the original document. Restoring
            # it in a finally block keeps the caller's dict intact even when
            # indexing fails.
            doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Every document yields two bulk actions, one for the formatted
        document and one for its metadata record. ``_id`` is popped from
        every document (the input dicts are mutated). Items rejected by
        Elasticsearch are logged, not raised, and an empty ``docs`` iterable
        is silently ignored.
        """
        def docs_to_upsert():
            # Sentinel: still None after the loop only if docs was empty.
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file's metadata together with its base64-encoded content.

        The metadata comes from ``f.get_metadata()``; the content read from
        ``f`` is stored under ``attachment_field``. A metadata record is
        indexed under the same id in the metadata index.
        """
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        # NOTE(review): the flag is one boolean for the whole manager, so the
        # attachment mapping is only put for the first index/type that
        # reaches this point - confirm that is intended when files arrive
        # from more than one namespace.
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        self.elastic.index(index=index, doc_type=doc_type,
                           body=doc, id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document and its metadata record from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        The ``body`` keyword is passed to ``scan`` as the query; positional
        arguments are accepted but not used. Each yielded ``_source`` dict
        has the hit's ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        Both bounds are inclusive and the metadata index is searched.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Force an index refresh, retried through ``retry_until_ok``."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Read the interval exactly once: stop() may set it to None from
        # another thread, and a Timer built with None would never fire.
        interval = self.auto_commit_interval
        if interval not in [None, 0]:
            Timer(interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        Returns the metadata entry with the highest ``_ts``, or None when
        there are no hits or Elasticsearch rejects the request.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #17

Show file

File: neo4j_doc_manager.py Project: hannelita/neo4j_doc_manager

class DocManager(DocManagerBase):
  """
  Neo4j implementation for the DocManager. Receives documents and
  communicates with Neo4j Server.
  """

  def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
               unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open the graph at ``url`` and store the manager's settings.

    The ``clientOptions`` entry of ``kwargs`` (None when absent) is kept on
    ``self.kwargs``.
    """
    self.graph = Graph(url)
    self.auto_commit_interval = auto_commit_interval
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
    self.kwargs = kwargs.get("clientOptions")

  def apply_id_constraint(self, doc_types):
    """Create a uniqueness constraint on ``_id`` for every given label."""
    for doc_type in doc_types:
      constraint = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d._id IS UNIQUE".format(doc_type=doc_type)
      self.graph.cypher.execute(constraint)

  def stop(self):
    """Reset ``auto_commit_interval`` to None."""
    self.auto_commit_interval = None

  @staticmethod
  def _append_builder_statements(tx, builder):
    """Queue the builder's node statements, then its relationship statements."""
    for statement, params in builder.query_nodes.items():
      tx.append(statement, params)
    for relationship, params in builder.relationships_query.items():
      tx.append(relationship, params)

  @wrap_exceptions
  def upsert(self, doc, namespace, timestamp):
    """Inserts a document into Neo4j.

    ``_id`` is popped from ``doc`` (the input dict is mutated). The node and
    relationship statements built for the document are committed in one
    transaction. ``timestamp`` is accepted but not used.
    """
    _, doc_type = self._index_and_mapping(namespace)
    doc_id = u(doc.pop("_id"))
    doc = self._formatter.format_document(doc)
    builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id)
    self.apply_id_constraint(builder.doc_types)
    tx = self.graph.cypher.begin()
    self._append_builder_statements(tx, builder)
    tx.commit()

  @wrap_exceptions
  def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Neo4j.

    The statements for all documents are queued on a single transaction
    that is committed once at the end. ``_id`` is popped from every
    document. ``timestamp`` is accepted but not used.

    NOTE(review): an earlier note here stated "Maximum chunk size is 1000.
    Transaction blocks won't have more than 1000 statements." Nothing in
    this method enforces such a cap - presumably the caller limits the size
    of ``docs``; confirm.
    """
    tx = self.graph.cypher.begin()
    for doc in docs:
      _, doc_type = self._index_and_mapping(namespace)
      doc_id = u(doc.pop("_id"))
      doc = self._formatter.format_document(doc)
      builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id)
      self.apply_id_constraint(builder.doc_types)
      self._append_builder_statements(tx, builder)
    tx.commit()

  @wrap_exceptions
  def update(self, document_id, update_spec, namespace, timestamp):
    """Apply ``update_spec`` to the node(s) of the given document.

    The statements produced by ``NodesAndRelationshipsUpdater`` are
    committed in one transaction. ``timestamp`` is accepted but not used.
    """
    doc_id = u(document_id)
    tx = self.graph.cypher.begin()
    _, doc_type = self._index_and_mapping(namespace)
    updater = NodesAndRelationshipsUpdater()
    updater.run_update(update_spec, doc_id, doc_type)
    for statement in updater.statements_with_params:
      for query, params in statement.items():
        tx.append(query, params)
    tx.commit()

  @wrap_exceptions
  def remove(self, document_id, namespace, timestamp):
    """Removes a document from Neo4j.

    Deletes the ``Document`` node whose ``_id`` matches, together with its
    relationships. ``timestamp`` is accepted but not used.
    """
    doc_id = u(document_id)
    # The result is not needed; the call is kept so that a namespace
    # without a '.' still raises exactly as before.
    self._index_and_mapping(namespace)
    params_dict = {"doc_id": doc_id}
    tx = self.graph.cypher.begin()
    statement = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
    tx.append(statement, params_dict)
    tx.commit()

  def search(self, start_ts, end_ts):
    """Not implemented: only logs that it was called and returns None."""
    LOG.error("Search")

  def commit(self):
    """Not implemented: only logs that it was called."""
    LOG.error("Commit")

  def get_last_doc(self):
    """Not implemented: only logs that it was called and returns None."""
    LOG.error("get last doc")

  def handle_command(self, doc, namespace, timestamp):
    """Accept a command document; no command is acted upon."""
    # The database name is extracted but nothing is done with it.
    db = namespace.split('.', 1)[0]

  def _index_and_mapping(self, namespace):
    """Helper method for getting the index and type from a namespace.

    The namespace is split on its first '.'; the index part is lower-cased,
    the type part is returned as is.
    """
    index, doc_type = namespace.split('.', 1)
    return index.lower(), doc_type

Example #18

Show file

File: elastic2_doc_manager.py Project: mongodb-labs/elastic2-doc-manager

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(
        self,
        url,
        auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
        unique_key="_id",
        chunk_size=DEFAULT_MAX_BULK,
        meta_index_name="mongodb_meta",
        meta_type="mongodb_meta",
        attachment_field="content",
        **kwargs
    ):
        """Create the Elasticsearch client and start the auto-commit thread.

        Args:
            url: A single host or a list of hosts handed to ``Elasticsearch``.
            auto_commit_interval: Stored on the instance and passed to the
                ``AutoCommiter``; a value of 0 makes ``index`` commit after
                every operation.
            unique_key: Stored as ``self.unique_key``.
            chunk_size: Number of buffered documents that triggers a commit;
                also used as the bulk chunk size when positive.
            meta_index_name: Index that receives the namespace/timestamp
                metadata documents.
            meta_type: Document type used for the metadata documents.
            attachment_field: Field that receives base64 file content in
                ``insert_file``.
            **kwargs: Recognised keys are ``clientOptions`` (extra keyword
                arguments for ``Elasticsearch``), ``aws`` (settings passed to
                ``create_aws_auth``) and ``autoSendInterval``.

        Raises:
            errors.InvalidConfiguration: If ``aws`` is given but the AWS
                extras are not installed.
        """
        # Work on a copy: the AWS settings added below must not leak into
        # the options dict owned by the caller.
        client_options = dict(kwargs.get("clientOptions", {}))
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]"
                )
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options["connection_class"] = es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(
            self, self.auto_send_interval, self.auto_commit_interval
        )
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread and flush whatever is still buffered."""
        # NOTE(review): presumably AutoCommiter.join() also signals the thread
        # to stop; its implementation is not visible here.
        self.auto_commiter.join()
        # With the interval at 0, index() commits after every operation.
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the resulting document.

        A spec containing ``$set`` or ``$unset`` is delegated to the base
        class. Any other spec is returned as-is.
        """
        uses_operators = "$set" in update_spec or "$unset" in update_spec
        if uses_operators:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a MongoDB database command to Elasticsearch.

        Handled commands, keyed by the field present in ``doc``:

        * ``dropDatabase``: delete every index mapped from the database.
        * ``renameCollection``: not supported.
        * ``create``: put a mapping for the new collection.
        * ``drop``: delete all documents of the collection's type; the
          mapping itself is left in place.

        Args:
            doc: The command document.
            namespace: Namespace whose part before the first "." names the
                database.
            timestamp: Accepted but not used.

        Raises:
            errors.OperationFailed: For ``renameCollection``.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping."
            )

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}}
                )

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                # Adjacent literals are joined verbatim, so each fragment
                # carries its own trailing space.
                warnings.warn(
                    "Deleting all documents of type %s on index %s. "
                    "The mapping definition will persist and must be "
                    "removed manually." % (coll, db)
                )
                responses = streaming_bulk(
                    self.elastic,
                    (
                        dict(result, _op_type="delete")
                        for result in scan(
                            self.elastic, index=db.lower(), doc_type=coll
                        )
                    ),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp
                        )

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the document identified by ``document_id``.

        If the document source is still in the local bulk buffer, the update
        is applied to it here and the result is upserted. Otherwise only the
        ``_id`` and the update spec are handed to ``upsert``.

        Returns:
            The dict that was passed to ``upsert``.
        """
        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            buffered = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id)
            )

        if not buffered:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
            return updated

        # Source was found in the local buffer: apply the update to it so
        # the result is ready to be committed to Elasticsearch.
        updated = self.apply_update(buffered, update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated["_id"] = document_id
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Queue an index operation for ``doc`` plus its metadata document.

        ``doc`` loses its ``_id`` while the actions are built and gets it
        back, as a string, before returning.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))

        # Index the source document, using lowercase namespace as index name.
        source = self._formatter.format_document(doc)
        action = dict(
            _op_type="index",
            _index=index,
            _type=doc_type,
            _id=doc_id,
            _source=source,
        )

        # Index document metadata with original namespace (mixed upper/lower).
        meta_source = bson.json_util.dumps({"ns": namespace, "_ts": timestamp})
        meta_action = dict(
            _op_type="index",
            _index=self.meta_index_name,
            _type=self.meta_type,
            _id=doc_id,
            _source=meta_source,
        )

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Stream many documents into Elasticsearch in bulk.

        Each document yields two actions: the document itself and a metadata
        entry holding its namespace and timestamp. Failed actions are logged.
        An empty ``docs`` sequence is silently ignored.
        """

        def build_actions():
            # Lazily yields the document action followed by its metadata action.
            last_doc = None
            for last_doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(last_doc.pop("_id"))
                yield {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(last_doc),
                }
                yield {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {"ns": namespace, "_ts": timestamp},
                }
            if last_doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search"
                )

        try:
            bulk_options = {}
            if self.chunk_size > 0:
                bulk_options["chunk_size"] = self.chunk_size

            outcomes = streaming_bulk(
                client=self.elastic, actions=build_actions(), **bulk_options
            )

            for succeeded, response in outcomes:
                if succeeded:
                    continue
                LOG.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % response
                )
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Queue a file for indexing: its metadata plus base64-encoded content.

        The first call also puts an ``attachment`` mapping for
        ``self.attachment_field`` on the target index and type.
        """
        source = f.get_metadata()
        doc_id = str(source.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            attachment_mapping = {
                "properties": {self.attachment_field: {"type": "attachment"}}
            }
            self.elastic.indices.put_mapping(
                index=index, doc_type=doc_type, body=attachment_mapping
            )
            self.has_attachment_mapping = True

        source = self._formatter.format_document(source)
        source[self.attachment_field] = base64.b64encode(f.read()).decode()
        action = dict(
            _op_type="index",
            _index=index,
            _type=doc_type,
            _id=doc_id,
            _source=source,
        )

        meta_source = bson.json_util.dumps({"ns": namespace, "_ts": timestamp})
        meta_action = dict(
            _op_type="index",
            _index=self.meta_index_name,
            _type=self.meta_type,
            _id=doc_id,
            _source=meta_source,
        )

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Queue deletion of a document and of its metadata entry.

        ``timestamp`` is accepted but not used.
        """
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = str(document_id)

        action = dict(_op_type="delete", _index=index, _type=doc_type, _id=doc_id)
        meta_action = dict(
            _op_type="delete",
            _index=self.meta_index_name,
            _type=self.meta_type,
            _id=doc_id,
        )

        self.index(action, meta_action)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every scan hit, with the hit's ``_id`` added.

        The ``body`` keyword, if present, is used as the scan query; all other
        keyword arguments are forwarded to ``scan``. Positional arguments are
        ignored.
        """
        query = kwargs.pop("body", None)
        hits = scan(self.elastic, query=query, scroll="10m", **kwargs)
        for hit in hits:
            source = hit["_source"]
            source["_id"] = hit["_id"]
            yield source

    def search(self, start_ts, end_ts):
        """Stream metadata documents whose ``_ts`` lies in [start_ts, end_ts].

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        ts_window = {"gte": start_ts, "lte": end_ts}
        range_query = {"query": {"range": {"_ts": ts_window}}}
        return self._stream_search(index=self.meta_index_name, body=range_query)

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Buffer an action with its metadata action, committing when due.

        A commit is triggered once the number of buffered documents reaches
        ``self.chunk_size`` or when ``self.auto_commit_interval`` is 0.
        """
        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action, doc_source, update_spec)

        # Divide by two to account for meta actions
        buffered_docs = len(self.BulkBuffer.action_buffer) / 2
        if buffered_docs >= self.chunk_size or self.auto_commit_interval == 0:
            self.commit()

    def send_buffered_operations(self):
        """Send the buffered actions to Elasticsearch in one bulk request.

        This method is periodically called by the AutoCommitThread. The lock
        is held for the whole request; Elasticsearch exceptions are logged
        rather than raised.
        """
        with self.lock:
            try:
                pending = self.BulkBuffer.get_buffer()
                if not pending:
                    return
                sent, failures = bulk(self.elastic, pending)
                LOG.debug(
                    "Bulk request finished, successfully sent %d operations",
                    sent,
                )
                if failures:
                    LOG.error("Bulk request finished with errors: %r", failures)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Flush buffered operations, then refresh the indexes.

        The refresh is called with an empty index name and is run through
        ``retry_until_ok``.
        """
        self.send_buffered_operations()
        refresh = self.elastic.indices.refresh
        retry_until_ok(refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Return the metadata document with the highest ``_ts``, or None.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback. The returned source dict
        has the hit's ``_id`` added to it.
        """
        newest_first = {"query": {"match_all": {}}, "sort": [{"_ts": "desc"}]}
        try:
            response = self.elastic.search(
                index=self.meta_index_name, body=newest_first, size=1
            )
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

        hits = response["hits"]["hits"]
        if not hits:
            return None
        newest = hits[0]
        newest["_source"]["_id"] = newest["_id"]
        return newest["_source"]

Example #19

Show file

File: elastic2_doc_manager.py Project: RiffynInc/elastic2-doc-manager

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Create the Elasticsearch client, metrics and auto-commit thread.

        The client connects to a URL assembled from environment variables
        (``ELASTIC_SSL_ENABLED``, ``ELASTIC_USER``, ``ELASTIC_PASSWORD``,
        ``ELASTIC_HOST``, ``ELASTIC_PORT``). The ``url`` argument is only
        logged. Timeout and retry behaviour come from ``ELASTIC_TIMEOUT``,
        ``ELASTIC_MAX_RETRY`` and ``ELASTIC_RETRY_ON_TIMEOUT``.

        Args:
            url: Host or list of hosts; logged, not used to connect.
            auto_commit_interval: Stored and passed to the ``AutoCommiter``;
                0 makes ``index`` commit after every operation.
            unique_key: Stored as ``self.unique_key``.
            chunk_size: Number of buffered documents that triggers a commit;
                also used as the bulk chunk size when positive.
            meta_index_name: Index that receives the metadata documents.
            meta_type: Document type used for the metadata documents.
            attachment_field: Field that receives base64 file content.
            **kwargs: Recognised keys are ``clientOptions``, ``aws`` and
                ``autoSendInterval``.

        Raises:
            errors.InvalidConfiguration: If ``aws`` is given but the AWS
                extras are not installed.
        """
        # NOTE(review): client_options (including the AWS auth) is assembled
        # here but is not passed to the Elasticsearch client created below.
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        client_options["verify_certs"] = False
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection

        if type(url) is not list:
            url = [url]

        LOG.always('URL IN DOC MANAGER:')
        LOG.always(url)

        # SSL is on unless ELASTIC_SSL_ENABLED is exactly "false".
        use_ssl = os.environ.get('ELASTIC_SSL_ENABLED') != "false"
        protocol = "https" if use_ssl else "http"
        username = os.environ.get('ELASTIC_USER')
        password = os.environ.get('ELASTIC_PASSWORD')
        hostname = os.environ.get('ELASTIC_HOST')
        port = os.environ.get('ELASTIC_PORT')

        timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
        max_retries = int(
            __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
        retry_on_timeout = bool(
            int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                              True)))

        LOG.info(" value of ELASTIC_TIMEOUT: {}".format(timeout))
        LOG.info(" value of ELASTIC_MAX_RETRY: {}".format(max_retries))
        LOG.info(
            " value of ELASTIC_RETRY_ON_TIMEOUT: {}".format(retry_on_timeout))

        # We're not using sniffing now - we will fix it using Connection with credentials.
        # The values are still parsed, but are not handed to the client.
        sniff_on_start = bool(
            int(__get_os_environ_or_default__('ELASTIC_SNIFF_ON_START', True)))
        sniff_on_connection_fail = bool(
            int(
                __get_os_environ_or_default__('ELASTIC_SNIFF_ON_CONN_FAIL',
                                              True)))
        sniffer_timeout = int(
            __get_os_environ_or_default__('ELASTIC_SNIFFER_TIMEOUT', 20))

        if username and password:
            credential_url = "{0}://{1}:{2}@{3}:{4}/"
            elastic_url = credential_url.format(protocol, username, password,
                                                hostname, port)
            # Never write the real password to the log.
            loggable_url = credential_url.format(protocol, username, "****",
                                                 hostname, port)
        else:
            elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)
            loggable_url = elastic_url

        LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
        LOG.always(loggable_url)

        # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
        # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
        # Sniffing caused authentication issue - it appears it was using username/password to retry. We'll revisit
        # this later to check if sniff can be integrated in case needed. Disabling it for now. SEAR-392
        self.elastic = Elasticsearch(
            hosts=[elastic_url],
            verify_certs=False,
            use_ssl=use_ssl,
            timeout=timeout,
            max_retries=max_retries,
            retry_on_timeout=retry_on_timeout
            # sniff_on_start=sniff_on_start,
            # sniff_on_connection_fail=sniff_on_connection_fail,
            # sniffer_timeout=sniffer_timeout
        )

        # Metrics for bulk ingestion.
        self.summary_title = 'dm_ingestion_time'
        self.counter_title = 'dm_ingest'
        self.REQUEST_TIME = Summary(self.summary_title,
                                    'Bulk operations throughput')
        self.ingest_rate = Counter(
            self.counter_title,
            'Number of documents ingested per bulk operation',
            ['collectionName'])

        # Metrics for single-document operations.
        self.doc_summary_title = 'new_doc_operation_time'
        self.doc_count_title = 'new_doc_operation'
        self.REQUEST_TIME_OP = Summary(
            self.doc_summary_title,
            'Operations on documents for Elasticsearch')
        self.doc_operation_count = Counter(self.doc_count_title,
                                           'Document operation',
                                           ['operation_type', 'index'])

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

        # with open('./config/mapping_resources_and_run_data.json', 'r') as mapping_config:
        #     try:
        #         # local_mapping = json.load(mapping_config)
        #         # local_mapping = str(local_mapping)
        #
        #         # try:
        #         #     es_mapping = self.elastic.indices.get_mapping(index='resources_and_run_data')
        #         #     es_mapping = es_mapping\
        #         #         .get('resources_and_run_data')\
        #         #         .get('mappings')\
        #         #         .get('resources_and_run_data')
        #         #
        #         #     # es_mapping = str(es_mapping)
        #         #
        #         #     # is_mapping_correct = local_mapping == es_mapping
        #         #     is_mapping_correct = diff(local_mapping, es_mapping)
        #         #
        #         #     LOG.always('*******************************************')
        #         #     LOG.always('LOCAL')
        #         #     LOG.always(local_mapping)
        #         #     LOG.always('*******************************************')
        #         #     LOG.always(' ')
        #         #     LOG.always(' ')
        #         #     LOG.always('*******************************************')
        #         #     LOG.always('ES')
        #         #     LOG.always(es_mapping)
        #         #     LOG.always('*******************************************')
        #         #
        #         #     LOG.always('*******************************************')
        #         #     LOG.always('diff')
        #         #     LOG.always(is_mapping_correct)
        #         #     LOG.always('*******************************************')
        #
        #             # if not is_mapping_correct:
        #
        #         except errors.ConnectionFailed:
        #             LOG.exception(
        #                 'Could not load mapping config on Elasticsearch'
        #             )
        #     except ValueError:
        #         LOG.exception(
        #             'Could not load mappings file'
        #         )
        #
        #         return

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread and flush whatever is still buffered."""
        # NOTE(review): presumably AutoCommiter.join() also signals the thread
        # to stop; its implementation is not visible here.
        self.auto_commiter.join()
        # With the interval at 0, index() commits after every operation.
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Return ``doc`` with ``update_spec`` applied.

        Specs that use ``$set`` or ``$unset`` go through the base class
        implementation. Any other spec is returned unchanged.
        """
        if "$set" in update_spec or "$unset" in update_spec:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a MongoDB database command to Elasticsearch.

        Handled commands, keyed by the field present in ``doc``:

        * ``dropDatabase``: delete every index mapped from the database.
        * ``renameCollection``: not supported.
        * ``create``: put a mapping for the new collection.
        * ``drop``: delete all documents of the collection's type; the
          mapping itself is left in place.

        Args:
            doc: The command document.
            namespace: Namespace whose part before the first "." names the
                database.
            timestamp: Accepted but not used.

        Raises:
            errors.OperationFailed: For ``renameCollection``.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {
                        "enabled": True
                    }})

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                # Adjacent literals are joined verbatim, so each fragment
                # carries its own trailing space.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the document identified by ``document_id``.

        If the document source is still in the local bulk buffer, the update
        is applied to it here and the result is upserted with
        ``is_update=True``. Otherwise only the ``_id`` and the update spec are
        handed to ``upsert``.

        Returns:
            The dict that was passed to ``upsert``.
        """
        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            buffered = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id))

        LOG.debug('_________________________ UPDATING FILE')
        LOG.debug(update_spec)

        if not buffered:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec, False)
            return updated

        # Source was found in the local buffer: apply the update to it so
        # the result is ready to be committed to Elasticsearch.
        updated = self.apply_update(buffered, update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated["_id"] = document_id
        self.upsert(updated, namespace, timestamp, None, True)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self,
               doc,
               namespace,
               timestamp,
               update_spec=None,
               is_update=False):
        """Queue an index operation for ``doc`` plus its metadata document.

        When ``is_update`` is true both actions are tagged with
        ``_update: True``. ``doc`` loses its ``_id`` while the actions are
        built and gets it back, as a string, before returning.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))

        # Index the source document, using lowercase namespace as index name.
        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc),
        }

        LOG.debug('_________________________ UPSERTING FILE')

        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps({
                "ns": namespace,
                "_ts": timestamp
            }),
        }

        if is_update:
            for pending in (action, meta_action):
                pending['_update'] = True

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp, collectionName=None):
        """Insert multiple documents into Elasticsearch.

        Each document yields two bulk actions: the document itself and a
        metadata entry holding its namespace and timestamp. When the
        ``JOIN_INDEX`` environment variable is set and ``namespace`` equals
        ``"<JOIN_INDEX>.<JOIN_INDEX>"``, a ``data_join`` field is added to
        every document. Documents carrying both ``CHILD_FIELD_1`` and
        ``CHILD_FIELD_2`` are additionally routed by their ``JOIN_FIELD``
        value.

        Args:
            docs: Iterable of documents; each must contain ``_id``.
            namespace: "db.collection" namespace of the documents.
            timestamp: Timestamp stored in the metadata entries.
            collectionName: Optional "x.y" string; when given, the part after
                the first "." labels the ingest counter. Otherwise the
                database part of ``namespace`` is used.

        An empty ``docs`` sequence is silently ignored.
        """
        # The join configuration does not change between documents, so read
        # the environment once per call.
        join_index = os.environ.get('JOIN_INDEX')
        join_field = os.environ.get('JOIN_FIELD')
        child_field_1 = os.environ.get('CHILD_FIELD_1')
        child_field_2 = os.environ.get('CHILD_FIELD_2')
        is_join_namespace = bool(join_index) and (
            namespace == join_index + "." + join_index)

        def docs_to_upsert():
            doc_count = 0
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                routing = False

                if is_join_namespace:
                    if doc.get(child_field_1) and doc.get(child_field_2):
                        routing = True
                        doc["data_join"] = {
                            "name": join_field,
                            "parent": doc.get(join_field)
                        }
                    else:
                        doc["data_join"] = {"name": "_id"}

                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }

                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    },
                }

                if routing:
                    route = doc.get(join_field)
                    document_meta["_routing"] = route
                    document_action["_routing"] = route

                yield document_action
                yield document_meta

                doc_count += 1
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

            LOG.always(" - - - - - COLLECTION")
            LOG.always(collectionName)
            LOG.always(" - - - - - # OF DOCS")
            LOG.always(doc_count)

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            kw["max_retries"] = 10

            # Label for the ingest counter. Unpacking keeps rejecting
            # values that contain no "." separator.
            ns, _ = namespace.split(".", 1)
            if collectionName:
                _, ns = collectionName.split(".", 1)

            @self.REQUEST_TIME.time()
            def process_request(metric):
                metric.inc()

            @ERROR_TIME.time()
            def error_catch(error):
                error.inc()

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        '_ Could not bulk-upsert document. ERROR RESP: {r}'.
                        format(r=resp))

                    error_catch(
                        ERROR_CAUGHT.labels(
                            'Could not bulk-upsert document into ElasticSearch',
                            resp))
                elif resp.get('index').get('_type') != self.meta_type:
                    # Count only real documents; metadata entries are
                    # written with the configured meta type.
                    process_request(self.ingest_rate.labels(ns))

            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Queue a file for indexing: its metadata plus base64-encoded content.

        The first call also puts an ``attachment`` mapping for
        ``self.attachment_field`` on the target index and type.
        """
        source = f.get_metadata()
        doc_id = str(source.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            attachment_mapping = {
                "properties": {
                    self.attachment_field: {
                        "type": "attachment"
                    }
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=attachment_mapping)
            self.has_attachment_mapping = True

        source = self._formatter.format_document(source)
        source[self.attachment_field] = base64.b64encode(f.read()).decode()
        action = dict(_op_type="index",
                      _index=index,
                      _type=doc_type,
                      _id=doc_id,
                      _source=source)

        meta_source = bson.json_util.dumps({"ns": namespace, "_ts": timestamp})
        meta_action = dict(_op_type="index",
                           _index=self.meta_index_name,
                           _type=self.meta_type,
                           _id=doc_id,
                           _source=meta_source)

        LOG.debug('_________________________ INSERTING FILE')

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch.

        Queues a delete action for the document and a matching delete action
        for its entry in the meta index, then hands both to ``self.index``.

        :param document_id: id of the document; converted with ``str()``.
        :param namespace: namespace resolved via ``_index_and_mapping``.
        :param timestamp: accepted for interface compatibility; not used here.
        """
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = str(document_id)

        action = {
            "_op_type": "delete",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
        }

        meta_action = {
            "_op_type": "delete",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
        }

        # When removing a runData doc, we need to get the routing field into
        # our action data. This allows the parent+child relationship to
        # successfully dissolve on removal; without the _routing field the
        # delete operation will throw an exception.
        join_index = os.environ.get('JOIN_INDEX')
        if join_index and index == join_index:
            try:
                hits = self.elastic.search(
                    index=index,
                    body={"query": {
                        "match": {
                            "_id": doc_id
                        }
                    }},
                    size=1)["hits"]["hits"]

                for hit in hits:
                    # A hit without routing is not an error: there is simply
                    # nothing to copy onto the delete actions.
                    routing = hit.get('_routing') if hit else None
                    if routing:
                        action['_routing'] = routing
                        meta_action['_routing'] = routing
            except Exception:
                # Best effort: the delete is still queued below. Catch
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit propagate, and keep the traceback in the log.
                LOG.exception(
                    'EXCEPTION: COULD NOT FIND DOCUMENT IN ELASTICSEARCH FOR REMOVAL OPERATION'
                )

        self.index(action, meta_action)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every hit of a scrolled ES search.

        The ``body`` keyword (if any) is forwarded to ``scan`` as ``query``;
        all remaining keywords are passed through unchanged. Each yielded
        source has the hit's ``_id`` copied into it.
        """
        scan_query = kwargs.pop("body", None)
        hits = scan(self.elastic, query=scan_query, scroll="10m", **kwargs)
        for hit in hits:
            hit_id = hit["_id"]
            source = hit["_source"]
            source["_id"] = hit_id
            yield source

    def search(self, start_ts, end_ts):
        """Return meta-index documents whose ``_ts`` is within a time range.

        Both bounds are inclusive. This is used to find documents that may
        be in conflict during a rollback event in MongoDB.
        """
        ts_window = {"gte": start_ts, "lte": end_ts}
        range_query = {"query": {"range": {"_ts": ts_window}}}
        return self._stream_search(index=self.meta_index_name,
                                   body=range_query)

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Buffer an action and its meta action, committing when due.

        When the ``JOIN_INDEX`` environment variable is set and equals the
        action's ``_type``, and a ``doc_source`` is supplied, a ``data_join``
        field is written into both the action source and ``doc_source``.
        Documents carrying both child fields (``CHILD_FIELD_1`` and
        ``CHILD_FIELD_2``) are additionally routed by their ``JOIN_FIELD``
        value.
        """
        env = os.environ.get
        join_index = env('JOIN_INDEX')

        if join_index and action["_type"] == join_index and doc_source:
            child_field_1 = env('CHILD_FIELD_1')
            child_field_2 = env('CHILD_FIELD_2')
            join_field = env('JOIN_FIELD')

            source_is_child = (doc_source.get(child_field_1)
                               and doc_source.get(child_field_2))
            action_source = action['_source']
            action_is_child = (action_source.get(child_field_1)
                               and action_source.get(child_field_2))

            if source_is_child or action_is_child:
                action_source['data_join'] = {
                    "name": join_field,
                    "parent": action_source[join_field],
                }
                doc_source['data_join'] = {
                    "name": join_field,
                    "parent": doc_source[join_field],
                }
                routing = doc_source.get(join_field)
                action["_routing"] = routing
                meta_action["_routing"] = routing
            else:
                action_source['data_join'] = {'name': '_id'}
                doc_source['data_join'] = {'name': '_id'}

        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action, doc_source,
                                       update_spec)

        # Every document contributes a meta action too, hence the halving.
        buffered_docs = len(self.BulkBuffer.action_buffer) / 2
        if buffered_docs >= self.chunk_size or self.auto_commit_interval == 0:
            self.commit()

    def send_buffered_operations(self):
        """Flush the bulk buffer to Elasticsearch and record metrics.

        This method is periodically called by the AutoCommitThread. The whole
        flush runs while holding ``self.lock``. Elasticsearch exceptions are
        logged and counted, not re-raised.
        """
        with self.lock:

            @ERROR_TIME.time()
            def record_error(counter):
                counter.inc()

            try:
                pending = self.BulkBuffer.get_buffer()
                if not pending:
                    return

                sent, failures = bulk(self.elastic, pending)
                LOG.debug(
                    "Bulk request finished, successfully sent %d "
                    "operations",
                    sent,
                )

                # Verbose debug dump of what was just sent.
                blank = ' '
                banner = '*****************************************'
                for line in (blank, blank, banner, blank, 'SUCCESSES', sent,
                             blank, 'ACTION BUFFER', pending, blank, banner,
                             blank, blank):
                    LOG.debug(line)

                if failures:
                    for failure in failures:
                        record_error(
                            ERROR_CAUGHT.labels('Bulk request error',
                                                failure))

                    LOG.error("Bulk request finished with errors: %r",
                              failures)

                # TODO: Add collection name as label
                @self.REQUEST_TIME_OP.time()
                def count_operation(kind, target_index):
                    self.doc_operation_count.labels(kind,
                                                    target_index).inc()

                # Only the first buffered action is inspected to decide which
                # operation counter is incremented.
                first_action = pending[0]
                target = first_action.get('_index')
                op_type = first_action.get('_op_type')

                if first_action.get('_update'):
                    count_operation('update', target)
                    LOG.debug('UPDATE!')
                elif op_type == 'index':
                    count_operation('add', target)
                    LOG.debug('ADD!')
                elif op_type == 'delete':
                    count_operation('remove', target)
                    LOG.debug('REMOVE!')

            except es_exceptions.ElasticsearchException as e:
                record_error(
                    ERROR_CAUGHT.labels('Bulk request failed with exception',
                                        'send_buffered_operations'))
                LOG.exception(
                    "Bulk request failed with exception {}".format(e))

    def commit(self):
        """Flush buffered requests, then refresh the indexes.

        The refresh call is wrapped in ``retry_until_ok`` and is issued with
        an empty index name.
        """
        self.send_buffered_operations()
        refresh = self.elastic.indices.refresh
        retry_until_ok(refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Return the meta document with the highest ``_ts``, or None.

        This is used to help define a time window within which documents may
        be in conflict after a MongoDB rollback. None is returned both when
        the search yields no hits and when Elasticsearch rejects the request.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(
                index=self.meta_index_name,
                body=newest_first,
                size=1,
            )
            latest = next(iter(response["hits"]["hits"]), None)
            if latest is None:
                return None
            latest_id = latest["_id"]
            source = latest["_source"]
            source["_id"] = latest_id
            return source
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #20

Show file

File: elastic2_doc_manager.py Project: RiffynInc/elastic2-doc-manager

    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        """Create the Elasticsearch client, metrics and auto-commit thread.

        The connection target is assembled from the ``ELASTIC_*`` environment
        variables (``ELASTIC_HOST``, ``ELASTIC_PORT``, ``ELASTIC_USER``,
        ``ELASTIC_PASSWORD``, ``ELASTIC_SSL_ENABLED``, ``ELASTIC_TIMEOUT``,
        ``ELASTIC_MAX_RETRY``, ``ELASTIC_RETRY_ON_TIMEOUT``). The ``url``
        argument is only normalised to a list and logged.

        :param url: Elasticsearch URL or list of URLs (logged only).
        :param auto_commit_interval: forwarded to the AutoCommiter; a value
            of 0 makes ``index`` commit after every operation.
        :param unique_key: stored as ``self.unique_key``.
        :param chunk_size: buffered-document threshold that triggers a commit.
        :param meta_index_name: index used for the metadata documents.
        :param meta_type: document type used for the metadata documents.
        :param attachment_field: field that receives file content.
        :param kwargs: recognised keys are ``clientOptions``, ``aws`` and
            ``autoSendInterval``.
        :raises errors.InvalidConfiguration: if ``aws`` is given but the AWS
            extras are not installed.
        """
        # Work on a copy so the caller's clientOptions dict is not mutated.
        client_options = dict(kwargs.get("clientOptions", {}))
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        # The same transport settings apply with and without AWS signing.
        client_options["use_ssl"] = True
        client_options["verify_certs"] = False
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
        # NOTE(review): client_options (including any AWS http_auth) is not
        # passed to the Elasticsearch() call below, so it currently has no
        # effect on the connection -- confirm whether that is intended.

        if not isinstance(url, list):
            url = [url]

        LOG.always('URL IN DOC MANAGER:')
        LOG.always(url)

        # self.elastic = Elasticsearch(hosts=url, **client_options)
        # SSL stays on unless ELASTIC_SSL_ENABLED is exactly "false".
        use_ssl = os.environ.get('ELASTIC_SSL_ENABLED') != "false"
        protocol = "https" if use_ssl else "http"
        username = os.environ.get('ELASTIC_USER')
        password = os.environ.get('ELASTIC_PASSWORD')
        hostname = os.environ.get('ELASTIC_HOST')
        port = os.environ.get('ELASTIC_PORT')

        timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
        max_retries = int(
            __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
        retry_on_timeout = bool(
            int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                              True)))

        LOG.info(" value of ELASTIC_TIMEOUT: {}".format(timeout))
        LOG.info(" value of ELASTIC_MAX_RETRY: {}".format(max_retries))
        LOG.info(
            " value of ELASTIC_RETRY_ON_TIMEOUT: {}".format(retry_on_timeout))

        if username and password:
            credentialed = "{0}://{1}:{2}@{3}:{4}/"
            elastic_url = credentialed.format(protocol, username, password,
                                              hostname, port)
            # Never write the password to the log: mask it in the logged URL.
            loggable_url = credentialed.format(protocol, username, "****",
                                               hostname, port)
        else:
            elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)
            loggable_url = elastic_url

        LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
        LOG.always(loggable_url)

        # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
        # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
        # Sniffing caused an authentication issue - it appears it was using
        # username/password to retry. It is disabled for now and may be
        # revisited later. SEAR-392
        # NOTE(review): verify_certs=False disables TLS certificate
        # verification -- confirm this is acceptable for the deployment.
        self.elastic = Elasticsearch(
            hosts=[elastic_url],
            verify_certs=False,
            use_ssl=use_ssl,
            timeout=timeout,
            max_retries=max_retries,
            retry_on_timeout=retry_on_timeout,
        )

        self.summary_title = 'dm_ingestion_time'
        self.counter_title = 'dm_ingest'
        self.REQUEST_TIME = Summary(self.summary_title,
                                    'Bulk operations throughput')
        self.ingest_rate = Counter(
            self.counter_title,
            'Number of documents ingested per bulk operation',
            ['collectionName'])

        self.doc_summary_title = 'new_doc_operation_time'
        self.doc_count_title = 'new_doc_operation'
        self.REQUEST_TIME_OP = Summary(
            self.doc_summary_title,
            'Operations on documents for Elasticsearch')
        self.doc_operation_count = Counter(self.doc_count_title,
                                           'Document operation',
                                           ['operation_type', 'index'])

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

Example #21

Show file

File: neo4j_doc_manager.py Project: mayankchutani/mongo-connector

class DocManager(DocManagerBase):
    """
    Neo4j implementation for the DocManager. Receives documents and
    communicates with Neo4j Server.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Connect to the graph at ``url`` and store the configuration.

        The REST authorization token is the base64 encoding of the
        ``NEO4J_AUTH`` environment variable. When the variable is unset the
        token is None and no authorization header is sent.
        """
        self.graph = Graph(url)
        self.url = url
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")

        auth = os.getenv('NEO4J_AUTH')
        if auth is None:
            self.authorization_token = None
        else:
            # b64encode needs bytes; os.getenv returns text on Python 3.
            if not isinstance(auth, bytes):
                auth = auth.encode('utf-8')
            self.authorization_token = base64.b64encode(auth).decode('ascii')

    def apply_id_constraint(self, doc_types):
        """Create a uniqueness constraint on ``uid`` for every doc type.

        Labels cannot be passed as Cypher parameters, so the upper-cased
        type name is interpolated into the statement.
        """
        for doc_type in doc_types:
            doc_type = doc_type.upper()
            constraint = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d.uid IS UNIQUE".format(doc_type=doc_type)
            self.graph.cypher.execute(constraint)

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commit_interval = None

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Inserts a document into Neo4j.

        All statements produced by the builder are sent in one transaction.
        A failed commit is logged and swallowed; after a successful commit
        the returned node ids are added to the geospatial index.
        """
        _, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("uid"))
        metadata = {"_ts": timestamp}
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        tx = self.graph.cypher.begin()
        for statement in builder.query_nodes.keys():
            tx.append(statement, builder.query_nodes[statement])
        # Adding cyphers from cypher list
        for query in builder.cypher_list:
            tx.append(query)
        for relationship, params in builder.relationships_query:
            tx.append(relationship, params)
        for statement in builder.statements_with_params:
            for key in statement.keys():
                tx.append(key, statement[key])
        commit_result = None
        try:
            commit_result = tx.commit()
            # Logged instead of printed: a print statement is a syntax error
            # on Python 3 and bypasses the logging configuration.
            LOG.debug('%s', commit_result)
        except Exception as e:
            # Best effort: a failed commit is reported but not re-raised.
            LOG.error('%s', e)

        if commit_result:
            nodeids_list = self._get_nodeids(commit_result)
            self.create_geospatial_indices(nodeids_list)

    def _get_nodeids(self, commit_result):
        """Flatten the values of every record in ``commit_result``.

        Results without records are skipped. ``commit_result`` is only read;
        it is no longer drained while being processed.
        """
        node_id_list = []
        for res in commit_result:
            for record in res.records or ():
                node_id_list.extend(record.__values__)
        return node_id_list

    def create_geospatial_indices(self, node_ids_list):
        """
        Creates geo spatial indices on the node ids
        :param node_ids_list:  list of node ids
        """
        layer_name = 'geom'
        lat = 'lat'
        lon = 'lon'
        geometry_type = 'point'
        self._set_id_to_nodeid(node_ids_list)
        # if_layer = self.if_layer_exists(layer_name)
        # if if_layer:
        self._create_layer(layer_name, lat, lon)
        self._add_geometry(layer_name, geometry_type, lat, lon)
        result = self._add_node_to_layer(node_ids_list, layer_name)
        # logging uses %-style placeholders; '{}' with an argument makes the
        # logging module report a formatting error instead of the message.
        LOG.info('Geospatial index creation response %s', repr(result))

    def _set_id_to_nodeid(self, node_ids_list):
        # TODO: We may want it to change to label name
        """
        Set the ``id`` property of each node to its internal node id.

        Statements are committed in batches so that a transaction does not
        grow beyond roughly 1000 statements.
        :param node_ids_list: list of node ids
        """
        tx = self.graph.cypher.begin()
        for count, nodeid in enumerate(node_ids_list, 1):
            if count % 1000 == 0:
                tx.commit()
                tx = self.graph.cypher.begin()
            query = 'MATCH (n) where id(n) = {nodeid} set n.id={nodeid}'.format(nodeid=nodeid)
            tx.append(query)
        if not tx.finished:
            tx.commit()

    def _add_node_to_layer(self, node_ids_list, layer_name):
        """
        Adds nodes to layer
        :param node_ids_list: list of node ids
        :param layer_name: <string>
        :return: [(nodeid, res)]
        """
        endpoint = '/ext/SpatialPlugin/graphdb/addNodeToLayer'
        url = self.url + endpoint
        result_list = []
        for nodeid in node_ids_list:
            node_endpoint = '/node/{}'.format(nodeid)
            node = self.url + node_endpoint
            payload = {'layer': layer_name,
                       'node': node}
            res = self._post_request(url, payload=payload)
            result_list.append((nodeid, res))
        return result_list

    def _add_geometry(self, layer_name, geometry_type, lat, lon):
        """Create a spatial index named ``layer_name``.

        :return: (<bool>, res) -- True when the server answered 201.
        """
        endpoint = '/index/node'
        url = self.url + endpoint
        payload = {
            'name': layer_name,
            'config': {
                'provider': 'spatial',
                'geometry_type': geometry_type,
                'lat': lat,
                'lon': lon,
            },
        }
        res = self._post_request(url, payload=payload)
        if res.status_code == 201:
            LOG.info('Geometry {} created'.format(geometry_type))
            return True, res
        else:
            LOG.error('Gometry creation error: {}'.format(geometry_type))
            return False, res

    def _create_layer(self, layer_name, lat, lon):
        """
        Creates a layer
        :param layer_name: <string>
        :return: (<bool>, res)
        """
        endpoint = '/ext/SpatialPlugin/graphdb/addSimplePointLayer'
        url = self.url + endpoint
        payload = {'layer': layer_name,
                   'lat': lat,
                   'lon': lon}
        res = self._post_request(url, payload=payload)
        if res.status_code == 200:
            LOG.info('Layer \'{}\' created successfully'.format(layer_name))
            return True, res
        else:
            LOG.error('Layer creation error code: {} - {}'.format(res.status_code, res))
            return False, res

    def _post_request(self, url, payload):
        """POST ``payload`` as JSON to ``url`` and return the response."""
        payload = json.dumps(payload)
        headers = {'content-type': 'application/json'}
        # The token is None when NEO4J_AUTH is not set; send no auth header.
        if self.authorization_token is not None:
            headers['authorization'] = self.authorization_token
        res = req.post(url, data=payload, headers=headers)
        return res

    def if_layer_exists(self, layer_name):
        """Return True when the server answers 200 for ``layer_name``."""
        endpoint = '/ext/SpatialPlugin/graphdb/getLayer'
        url = self.url + endpoint
        payload = {'layer': layer_name}
        res = self._post_request(url, payload=payload)
        return res.status_code == 200

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Neo4j.

        Every document's statements are appended to a single transaction
        that is committed once at the end; a failed commit is logged and
        swallowed.

        NOTE(review): an earlier comment stated that transaction blocks
        never exceed 1000 statements, but nothing here enforces a limit --
        confirm whether the caller bounds the size of ``docs``.
        """
        metadata = {"_ts": timestamp}
        # The namespace is the same for every document: resolve it once.
        _, doc_type = self._index_and_mapping(namespace)
        tx = self.graph.cypher.begin()
        for doc in docs:
            doc_id = u(doc.pop("uid"))
            doc = self._formatter.format_document(doc)
            builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
            self.apply_id_constraint(builder.doc_types)
            for statement in builder.query_nodes.keys():
                tx.append(statement, builder.query_nodes[statement])
            # Adding cyphers from cypher list
            for query in builder.cypher_list:
                tx.append(query)
            for relationship, params in builder.relationships_query:
                tx.append(relationship, params)
            for statement in builder.statements_with_params:
                for key in statement.keys():
                    tx.append(key, statement[key])
        try:
            tx.commit()
        except Exception as e:
            # Best effort: a failed commit is reported but not re-raised.
            LOG.error('%s', e)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the node identified by ``document_id``."""
        doc_id = u(document_id)
        tx = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        updater = NodesAndRelationshipsUpdater()
        updater.run_update(update_spec, doc_id, doc_type)
        for statement in updater.statements_with_params:
            for key in statement.keys():
                tx.append(key, statement[key])
        tx.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes a document from Neo4j."""
        doc_id = u(document_id)
        params_dict = {"doc_id": doc_id}
        tx = self.graph.cypher.begin()
        statement = "MATCH (d:Document) WHERE d.uid={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        tx.append(statement, params_dict)
        tx.commit()

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Return Document nodes whose ``_ts`` lies in [start_ts, end_ts].

        The bounds are passed as Cypher parameters, as ``remove`` does,
        rather than being formatted into the statement text.
        """
        statement = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d"
        params_dict = {"start_ts": start_ts, "end_ts": end_ts}
        results = self.graph.cypher.execute(statement, params_dict)
        return results

    def commit(self):
        """Placeholder: only writes a log entry.

        NOTE(review): this logs at ERROR level on every call -- confirm
        whether a lower level was intended.
        """
        LOG.error("Commit")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified node from Neo4j.
        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Not implemented: it only logs and implicitly returns None.
        """
        # The previous message here was a copy-pasted "Commit".
        LOG.error("get_last_doc is not implemented for Neo4j")

    def handle_command(self, doc, namespace, timestamp):
        """Database commands are ignored by this doc manager (no-op)."""

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

Example #22

Show file

File: elastic2_doc_manager.py Project: mallegrini/elastic2-doc-manager

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch at url and store the sync settings.

        Recognised kwargs are 'clientOptions' (extra keyword arguments for
        the Elasticsearch client) and 'routing' (per-type settings such as
        'parentField').
        """
        client_options = kwargs.get('clientOptions', {})
        self.elastic = Elasticsearch(hosts=[url], **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        auto_commit_enabled = self.auto_commit_interval not in [None, 0]
        if auto_commit_enabled:
            # Kicks off the self-rescheduling refresh loop.
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        # Attachment support used by insert_file().
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def _get_parent_id(self, doc_type, doc):
        """Get parent ID from doc"""
        if doc_type in self.routing:
            if '_parent' in doc:
                return doc.pop('_parent')

            parent_field = self.routing[doc_type].get('parentField')

            if not parent_field:
                return None

            parent_id = doc.pop(parent_field) if parent_field in doc else None
            return self._formatter.transform_value(parent_id)

    def _search_doc_by_id(self, index, doc_type, doc_id):
        """Return the Elasticsearch hit with this _id, or None.

        None is returned unless the search reports exactly one hit.
        """
        ids_query = {
            'query': {
                'ids': {
                    'type': doc_type,
                    'values': [u(doc_id)]
                }
            }
        }
        response = self.elastic.search(index=index, doc_type=doc_type,
                                       body=ids_query)
        hits = response['hits']
        return hits['hits'][0] if hits['total'] == 1 else None

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval makes run_auto_commit() stop rescheduling
        itself the next time it runs.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc and return the resulting document.

        A spec with neither "$set" nor "$unset" is returned as-is;
        otherwise the base class implementation does the work.
        """
        uses_operators = "$set" in update_spec or "$unset" in update_spec
        if uses_operators:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror a MongoDB command document in Elasticsearch.

        The following keys of doc are handled:
          * 'dropDatabase': delete every index mapped from the database.
          * 'renameCollection': unsupported, raises errors.OperationFailed.
          * 'create': put a mapping for the newly created collection.
          * 'drop': delete all documents of the type; the mapping stays.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                # Adjacent literals are concatenated verbatim, so each piece
                # needs its own trailing space.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply update_spec to the stored document and re-index it.

        Returns the updated document, or None when a document of a routed
        type cannot be found in Elasticsearch.
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)

        routed_child = (doc_type in self.routing
                        and 'parentField' in self.routing[doc_type])
        if not routed_child:
            stored = self.elastic.get(index=index, doc_type=doc_type,
                                      id=u(document_id))
        else:
            # get() cannot be used here: Elasticsearch needs the parent ID
            # to route the request, and an update request may not carry
            # it, so a full search by _id is done instead.
            stored = self._search_doc_by_id(index, doc_type, document_id)
            if stored is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply update', u(document_id))
                return None

        result = self.apply_update(stored['_source'], update_spec)
        # _id is immutable in MongoDB, so the stored value is still valid.
        result['_id'] = stored['_id']
        if '_parent' in stored:
            result['_parent'] = stored['_parent']
        self.upsert(result, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return result

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Index doc in Elasticsearch and record its metadata entry.

        The document goes into the index and type derived from namespace;
        a second entry with the namespace and timestamp goes into the
        metadata index under the same ID.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # The ID is passed separately, so it is not duplicated in the source.
        doc_id = u(doc.pop("_id"))
        meta_body = {"ns": namespace, "_ts": timestamp}

        # Runs before formatting because it may pop the parent field.
        parent_id = self._get_parent_id(doc_type, doc)
        index_args = {
            "index": index,
            "doc_type": doc_type,
            "body": self._formatter.format_document(doc),
            "id": doc_id,
        }
        if parent_id is not None:
            index_args["parent"] = parent_id
        index_args["refresh"] = (self.auto_commit_interval == 0)
        self.elastic.index(**index_args)

        # Metadata keeps the original namespace (mixed upper/lower case).
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(meta_body), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Put _id back, since it's part of the original document.
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Two bulk actions are streamed per document: the document itself and
        a metadata entry holding its namespace and timestamp. An empty docs
        sequence is silently ignored.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                index, doc_type = self._index_and_mapping(namespace)
                # Remove the redundant _id; it is sent as action metadata.
                doc_id = u(doc.pop("_id"))
                # Pop the parent field before formatting, as upsert() does,
                # so the source is formatted once and without that field.
                parent_id = self._get_parent_id(doc_type, doc)
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                if parent_id is not None:
                    document_action["_parent"] = parent_id

                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        # Full original namespace, matching upsert() and
                        # insert_file(); the lower-cased index alone loses
                        # the collection name.
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }

                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file as an attachment document plus its metadata entry.

        The file content is base64-encoded into self.attachment_field of the
        formatted file metadata.
        """
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        if not self.has_attachment_mapping:
            # make sure that elasticsearch treats it like a file
            attachment_mapping = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=attachment_mapping)
            self.has_attachment_mapping = True

        meta_body = {'ns': namespace, '_ts': timestamp}

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        parent_id = self._get_parent_id(doc_type, doc)
        index_args = {
            "index": index,
            "doc_type": doc_type,
            "body": doc,
            "id": doc_id,
        }
        if parent_id is not None:
            index_args["parent"] = parent_id
        index_args["refresh"] = (self.auto_commit_interval == 0)
        self.elastic.index(**index_args)

        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(meta_body), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Delete a document and its metadata entry from Elasticsearch.

        For routed types the stored document is looked up first; when it
        cannot be found an error is logged and nothing is deleted.
        """
        index, doc_type = self._index_and_mapping(namespace)

        routed_child = (doc_type in self.routing
                        and 'parentField' in self.routing[doc_type])
        routing_args = {}
        if routed_child:
            # delete() needs the parent ID for routing, which a remove
            # request may not carry, so the document is searched for first.
            hit = self._search_doc_by_id(index, doc_type, document_id)
            if hit is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply remove', u(document_id))
                return
            routing_args["parent"] = self._get_parent_id(doc_type, hit)

        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0),
                            **routing_args)

        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the _source of every scanned hit, with _id/_parent added.

        The 'body' keyword is passed to scan() as the query; all remaining
        keywords are forwarded unchanged. Positional arguments are ignored.
        """
        query = kwargs.pop('body', None)
        for hit in scan(self.elastic, query=query, scroll='10m', **kwargs):
            source = hit['_source']
            source['_id'] = hit['_id']
            if '_parent' in hit:
                source['_parent'] = hit['_parent']
            yield source

    def search(self, start_ts, end_ts):
        """Query the metadata index for entries in a timestamp range.

        This is used to find documents that may be in conflict during a
        rollback event in MongoDB. Both bounds are inclusive.
        """
        ts_range = {"_ts": {"gte": start_ts, "lte": end_ts}}
        query_body = {
            "query": {
                "filtered": {
                    "filter": {"range": ts_range}
                }
            }
        }
        return self._stream_search(index=self.meta_index_name,
                                   body=query_body)

    def commit(self):
        """Refresh all Elasticsearch indexes.

        The refresh call is handed to retry_until_ok rather than invoked
        directly.
        """
        # NOTE(review): index="" presumably addresses every index -- confirm.
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Refresh the indexes now and schedule the next refresh.

        Rescheduling stops once auto_commit_interval is None or 0.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval in [None, 0]:
            return
        timer = Timer(self.auto_commit_interval, self.run_auto_commit)
        timer.start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the metadata entry with the highest _ts, or None.

        This helps define a time window within which documents may be in
        conflict after a MongoDB rollback. The returned source dict has the
        hit's _id added to it.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(index=self.meta_index_name,
                                           body=newest_first, size=1)
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        hits = response["hits"]["hits"]
        if not hits:
            return None
        newest = hits[0]
        newest['_source']['_id'] = newest['_id']
        return newest['_source']

Example #23

Show file

File: neo4j_doc_manager.py Project: alibahsisoglu/neo4j_doc_manager

class DocManager(DocManagerBase):
  """
  Neo4j implementation for the DocManager. Receives documents and 
  communicates with Neo4j Server.
  """

  def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open the Neo4j graph at url and store the sync settings."""
    client_options = kwargs.get("clientOptions")
    self.graph = Graph(url)
    self.auto_commit_interval = auto_commit_interval
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
    # None when no 'clientOptions' entry was supplied.
    self.kwargs = client_options

  def apply_id_constraint(self, doc_types):
    """Create a uniqueness constraint on _id for each given label."""
    template = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d._id IS UNIQUE"
    for label in doc_types:
      self.graph.cypher.execute(template.format(doc_type=label))

  def stop(self):
    """Clear the auto-commit interval.

    NOTE(review): no auto-commit timer is started in the visible part of
    this class, so this presumably only satisfies the DocManager
    interface -- confirm.
    """
    self.auto_commit_interval = None
  
  @wrap_exceptions
  def upsert(self, doc, namespace, timestamp):
    """Insert a document into Neo4j within a single transaction.

    The formatted document is turned into node and relationship
    statements by NodesAndRelationshipsBuilder.
    """
    _, doc_type = self._index_and_mapping(namespace)
    node_id = u(doc.pop("_id"))
    formatted = self._formatter.format_document(doc)
    builder = NodesAndRelationshipsBuilder(
        formatted, doc_type, node_id, {"_ts": timestamp})
    self.apply_id_constraint(builder.doc_types)
    transaction = self.graph.cypher.begin()
    # Nodes first, then the relationships between them.
    for cypher, params in builder.query_nodes.items():
      transaction.append(cypher, params)
    for cypher, params in builder.relationships_query.items():
      transaction.append(cypher, params)
    transaction.commit()

  @wrap_exceptions
  def bulk_upsert(self, docs, namespace, timestamp):
    """Insert many documents into Neo4j, one transaction per chunk.

    docs may be any iterable of documents, not only an iterator. At most
    self.chunk_size documents go into one transaction; a chunk size that
    is None, zero or negative puts all documents into a single
    transaction. A transaction is only opened once there is a document
    to write, so none is left open when docs ends on a chunk boundary.
    """
    metadata = {"_ts": timestamp}
    chunk_size = self.chunk_size
    tx = None
    pending = 0
    for doc in docs:
        if tx is None:
            tx = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("_id"))
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        for statement, params in builder.query_nodes.items():
            tx.append(statement, params)
        for relationship, params in builder.relationships_query.items():
            tx.append(relationship, params)
        pending += 1
        # Chunk is full: commit it and start a new one lazily.
        if chunk_size and chunk_size > 0 and pending >= chunk_size:
            tx.commit()
            tx = None
            pending = 0
    # Commit the final, partially filled chunk.
    if tx is not None:
        tx.commit()

  @wrap_exceptions
  def update(self, document_id, update_spec, namespace, timestamp):
    """Apply a MongoDB update spec to the matching data in Neo4j.

    The spec is turned into Cypher statements by
    NodesAndRelationshipsUpdater; all of them are appended to one
    transaction, which is then committed.
    """
    node_id = u(document_id)
    transaction = self.graph.cypher.begin()
    _, doc_type = self._index_and_mapping(namespace)
    translator = NodesAndRelationshipsUpdater()
    translator.run_update(update_spec, node_id, doc_type)
    # Every entry pairs a statement with the value passed along with it.
    for entry in translator.statements_with_params:
      for cypher, params in entry.items():
        transaction.append(cypher, params)
    transaction.commit()

  @wrap_exceptions
  def remove(self, document_id, namespace, timestamp):
    """Delete the Document node with this _id, plus its relationships."""
    node_id = u(document_id)
    # The result is unused, but a malformed namespace still fails here.
    self._index_and_mapping(namespace)
    transaction = self.graph.cypher.begin()
    cypher = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
    transaction.append(cypher, {"doc_id": node_id})
    transaction.commit()

  @wrap_exceptions
  def search(self, start_ts, end_ts):
    """Return the Document nodes whose _ts is within [start_ts, end_ts]."""
    template = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d"
    query = template.format(start_ts=start_ts, end_ts=end_ts)
    return self.graph.cypher.execute(query)


  def commit(self):
    """Log that a commit was requested; nothing else is done here."""
    # NOTE(review): logged at error level although nothing failed -- confirm
    # this is intentional.
    LOG.error("Commit")
    

  @wrap_exceptions
  def get_last_doc(self):
    """Placeholder for fetching the most recently modified node from Neo4j.

    The result is meant to help define a time window within which
    documents may be in conflict after a MongoDB rollback.

    NOTE(review): no query is issued; the method only logs the message
    below and implicitly returns None.
    """
    LOG.error("Commit")

    
  def handle_command(self, doc, namespace, timestamp):
    """Accept a MongoDB command without acting on it.

    Only the database part of the namespace is extracted; nothing is
    sent to Neo4j and None is returned.
    """
    database, _, _ = namespace.partition('.')

  def _index_and_mapping(self, namespace):
    """Helper method for getting the index and type from a namespace."""
    index, doc_type = namespace.split('.', 1)
    return index.lower(), doc_type

Example #24

Show file

File: neo4j_doc_manager.py Project: alibahsisoglu/neo4j_doc_manager

class DocManager(DocManagerBase):
    """
  Neo4j implementation for the DocManager. Receives documents and 
  communicates with Neo4j Server.
  """

    def __init__(
        self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL, unique_key="_id", chunk_size=DEFAULT_MAX_BULK, **kwargs
    ):
        """Open the Neo4j graph at url and store the sync settings."""
        client_options = kwargs.get("clientOptions")
        self.graph = Graph(url)
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        # None when no 'clientOptions' entry was supplied.
        self.kwargs = client_options

    def apply_id_constraint(self, doc_types):
        """Create a uniqueness constraint on _id for each given label."""
        template = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d._id IS UNIQUE"
        for label in doc_types:
            self.graph.cypher.execute(template.format(doc_type=label))

    def stop(self):
        """Clear the auto-commit interval.

        NOTE(review): no auto-commit timer is started in the visible part of
        this class, so this presumably only satisfies the DocManager
        interface -- confirm.
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Neo4j within a single transaction.

        The formatted document is turned into node and relationship
        statements by NodesAndRelationshipsBuilder.
        """
        _, doc_type = self._index_and_mapping(namespace)
        node_id = u(doc.pop("_id"))
        formatted = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(
            formatted, doc_type, node_id, {"_ts": timestamp})
        self.apply_id_constraint(builder.doc_types)
        transaction = self.graph.cypher.begin()
        # Nodes first, then the relationships between them.
        for cypher, params in builder.query_nodes.items():
            transaction.append(cypher, params)
        for cypher, params in builder.relationships_query.items():
            transaction.append(cypher, params)
        transaction.commit()

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert many documents into Neo4j, one transaction per chunk.

        docs may be any iterable of documents, not only an iterator. At
        most self.chunk_size documents go into one transaction; a chunk
        size that is None, zero or negative puts all documents into a
        single transaction. A transaction is only opened once there is a
        document to write, so none is left open when docs ends on a chunk
        boundary.
        """
        metadata = {"_ts": timestamp}
        chunk_size = self.chunk_size
        tx = None
        pending = 0
        for doc in docs:
            if tx is None:
                tx = self.graph.cypher.begin()
            _, doc_type = self._index_and_mapping(namespace)
            doc_id = u(doc.pop("_id"))
            doc = self._formatter.format_document(doc)
            builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
            self.apply_id_constraint(builder.doc_types)
            for statement, params in builder.query_nodes.items():
                tx.append(statement, params)
            for relationship, params in builder.relationships_query.items():
                tx.append(relationship, params)
            pending += 1
            # Chunk is full: commit it and start a new one lazily.
            if chunk_size and chunk_size > 0 and pending >= chunk_size:
                tx.commit()
                tx = None
                pending = 0
        # Commit the final, partially filled chunk.
        if tx is not None:
            tx.commit()

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply a MongoDB update spec to the matching data in Neo4j.

        The spec is turned into Cypher statements by
        NodesAndRelationshipsUpdater; all of them are appended to one
        transaction, which is then committed.
        """
        node_id = u(document_id)
        transaction = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        translator = NodesAndRelationshipsUpdater()
        translator.run_update(update_spec, node_id, doc_type)
        # Every entry pairs a statement with the value passed along with it.
        for entry in translator.statements_with_params:
            for cypher, params in entry.items():
                transaction.append(cypher, params)
        transaction.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Delete the Document node with this _id, plus its relationships."""
        node_id = u(document_id)
        # The result is unused, but a malformed namespace still fails here.
        self._index_and_mapping(namespace)
        transaction = self.graph.cypher.begin()
        cypher = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        transaction.append(cypher, {"doc_id": node_id})
        transaction.commit()

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Return the Document nodes whose _ts is within [start_ts, end_ts]."""
        template = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d"
        query = template.format(start_ts=start_ts, end_ts=end_ts)
        return self.graph.cypher.execute(query)

    def commit(self):
        """Log that a commit was requested; nothing else is done here."""
        # NOTE(review): logged at error level although nothing failed -- confirm
        # this is intentional.
        LOG.error("Commit")

    @wrap_exceptions
    def get_last_doc(self):
        """Placeholder for fetching the most recently modified node from Neo4j.

        The result is meant to help define a time window within which
        documents may be in conflict after a MongoDB rollback.

        NOTE(review): no query is issued; the method only logs the message
        below and implicitly returns None.
        """
        LOG.error("Commit")

    def handle_command(self, doc, namespace, timestamp):
        """Accept a MongoDB command without acting on it.

        Only the database part of the namespace is extracted; nothing is
        sent to Neo4j and None is returned.
        """
        database, _, _ = namespace.partition(".")

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type

Example #25

Show file

File: elastic_doc_manager.py Project: MicroFocus/mongo-connector

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Connect to Elasticsearch at url and store the sync settings.

        Extra keyword arguments are accepted but not used.
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        # default type is string, change if needed
        self.doc_type = 'string'
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        auto_commit_enabled = self.auto_commit_interval not in [None, 0]
        if auto_commit_enabled:
            # Kicks off the self-rescheduling refresh loop.
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval makes run_auto_commit() stop rescheduling
        itself the next time it runs.
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def handle_command(self, doc, namespace_set):
        """Handle database and other command operations.

        Only collection drops are acted on: when the namespace of the
        dropped collection ("db.collection") is in namespace_set, the index
        of that name is deleted. All other commands are ignored.
        """
        logging.debug("ES:handle_command")

        if not namespace_set:
            return
        # Commands other than 'drop' carry no 'drop' key; treat them as a
        # no-op instead of failing with KeyError.
        coll = doc.get('drop')
        if coll in [None, ""]:
            return
        db, _ = doc['ns'].split(".", 1)
        index = db + "." + coll
        if index in namespace_set:
            logging.debug("ES: received drop for " + index)
            self.elastic.indices.delete(index)
	
    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply update_spec to the stored copy of doc and re-index it.

        The stored document is fetched by the 'ns' and '_id' of doc; the
        updated document is returned.
        """
        stored = self.elastic.get(index=doc['ns'],
                                  id=str(doc['_id']))
        result = self.apply_update(stored['_source'], update_spec)
        # _id is immutable in MongoDB, so the stored value is still valid.
        result['_id'] = stored['_id']
        self.upsert(result)
        return result

    @wrap_exceptions
    def upsert(self, doc):
        """Insert a document into the index named by its 'ns' field.

        '_id' is taken out of doc while indexing and put back afterwards
        as a string.
        """
        target_index = doc['ns']
        # The ID is passed separately, so it is not duplicated in the source.
        doc_id = str(doc.pop("_id"))
        body = self._formatter.format_document(doc)
        refresh_now = (self.auto_commit_interval == 0)
        self.elastic.index(index=target_index, doc_type=self.doc_type,
                           body=body, id=doc_id, refresh=refresh_now)
        # Restore the key that was popped above.
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch.

        Every document is indexed into the index named by its 'ns' field.
        An empty docs sequence is silently ignored.
        """
        def to_action(doc):
            # Build the bulk action for one document; _id travels as
            # action metadata rather than inside the source.
            target_index = doc["ns"]
            doc_id = str(doc.pop("_id"))
            return {
                "_index": target_index,
                "_type": self.doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc)
            }

        def docs_to_upsert():
            last = None
            for last in docs:
                yield to_action(last)
            if not last:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        bulk_options = {}
        if self.chunk_size > 0:
            bulk_options['chunk_size'] = self.chunk_size
        try:
            results = streaming_bulk(client=self.elastic,
                                     actions=docs_to_upsert(),
                                     **bulk_options)
            for ok, resp in results:
                if ok:
                    continue
                logging.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Delete doc from the index named by its 'ns' field."""
        target_index = doc['ns']
        target_id = str(doc["_id"])
        refresh_now = (self.auto_commit_interval == 0)
        self.elastic.delete(index=target_index, doc_type=self.doc_type,
                            id=target_id, refresh=refresh_now)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the _source of every scanned hit, with its _id added.

        The 'body' keyword is passed to scan() as the query; all remaining
        keywords are forwarded unchanged. Positional arguments are ignored.
        """
        query = kwargs.pop('body', None)
        for hit in scan(self.elastic, query=query, scroll='10m', **kwargs):
            source = hit['_source']
            source['_id'] = hit['_id']
            yield source

    def search(self, start_ts, end_ts):
        """Query all indexes for documents in a timestamp range.

        This is used to find documents that may be in conflict during a
        rollback event in MongoDB. Both bounds are inclusive.
        """
        ts_range = {"_ts": {"gte": start_ts, "lte": end_ts}}
        query_body = {
            "query": {
                "filtered": {
                    "filter": {"range": ts_range}
                }
            }
        }
        return self._stream_search(index="_all", body=query_body)

    def commit(self):
        """Refresh all Elasticsearch indexes.

        The refresh call is handed to retry_until_ok rather than invoked
        directly.
        """
        # NOTE(review): index="" presumably addresses every index -- confirm.
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Refresh the indexes now and schedule the next refresh.

        Rescheduling stops once auto_commit_interval is None or 0.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval in [None, 0]:
            return
        timer = Timer(self.auto_commit_interval, self.run_auto_commit)
        timer.start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the document with the highest _ts, or None.

        This helps define a time window within which documents may be in
        conflict after a MongoDB rollback. The returned source dict has the
        hit's _id added to it.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(index="_all",
                                           body=newest_first, size=1)
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        hits = response["hits"]["hits"]
        if not hits:
            return None
        newest = hits[0]
        newest['_source']['_id'] = newest['_id']
        return newest['_source']

Example #26

Show file

File: elastic_doc_manager.py Project: gwecho/mongo-connector

class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 **kwargs):
        """Connect to Elasticsearch and record the manager's settings.

        Args:
            url: Host handed to the ``Elasticsearch`` client.
            auto_commit_interval: Stored on the instance; when it is neither
                ``None`` nor ``0`` the auto-commit cycle is started here.
            unique_key: Stored as ``self.unique_key``.
            chunk_size: Stored as ``self.chunk_size``.
            meta_index_name: Index that receives the metadata records.
            meta_type: Document type used for the metadata records.
            **kwargs: Accepted but not used by this constructor.
        """
        # Connection and document formatting.
        self.elastic = Elasticsearch(hosts=[url])
        self._formatter = DefaultDocumentFormatter()

        # Indexing configuration.
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type

        # Start the periodic refresh last, once every attribute is in place.
        self.auto_commit_interval = auto_commit_interval
        if auto_commit_interval not in (None, 0):
            self.run_auto_commit()

    def stop(self):
        """Stop the auto-commit cycle.

        Only ``auto_commit_interval`` is cleared; nothing is cancelled here.
        ``run_auto_commit`` checks that attribute before scheduling its next
        timer, so no further timers are started once it is ``None``.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Return the document that results from applying ``update_spec``.

        A spec containing ``$set`` or ``$unset`` is delegated to the parent
        class.  Any other spec is returned as-is, without copying anything
        from ``doc`` into it.
        """
        uses_operators = "$set" in update_spec or "$unset" in update_spec
        if uses_operators:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply ``update_spec`` to the stored document matching ``doc``.

        The indexes are committed first, then the stored document is fetched
        by ``doc['ns']`` and ``str(doc['_id'])``.  ``update_spec`` is applied
        to its ``_source`` and the result is re-indexed through ``upsert()``.

        Returns:
            The updated document.  ``upsert()`` pops ``ns`` and ``_ts`` from
            it, so it comes back with ``_id`` plus the source fields.
        """
        self.commit()
        namespace = doc['ns']
        stored = self.elastic.get(index=namespace, id=str(doc['_id']))
        merged = self.apply_update(stored['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        merged['_id'] = stored['_id']
        # upsert() expects the metadata fields inside the document itself.
        # Need to do this until these become separate arguments in 2.x
        merged['ns'] = namespace
        merged['_ts'] = doc['_ts']
        self.upsert(merged)
        return merged

    @wrap_exceptions
    def upsert(self, doc):
        """Index ``doc`` and its metadata record in Elasticsearch.

        ``ns``, ``_id`` and ``_ts`` are popped from ``doc``.  The remaining
        fields are formatted and indexed under ``ns``; a second record holding
        ``ns`` and ``_ts`` is indexed under the metadata index with the same
        id.  Both writes request an immediate refresh only when
        ``auto_commit_interval`` is ``0``.  ``doc`` is mutated: on return it
        carries ``_id`` again (as a string) but no ``ns`` or ``_ts``.
        """
        index = doc.pop('ns')
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {"ns": index, "_ts": doc.pop("_ts")}
        refresh_now = self.auto_commit_interval == 0

        # Index the source document
        self.elastic.index(
            index=index,
            doc_type=self.doc_type,
            body=self._formatter.format_document(doc),
            id=doc_id,
            refresh=refresh_now)
        # Index document metadata
        self.elastic.index(
            index=self.meta_index_name,
            doc_type=self.meta_type,
            body=bson.json_util.dumps(metadata),
            id=doc_id,
            refresh=refresh_now)

        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch.

        Every document in ``docs`` produces two bulk actions: the formatted
        source document in its own namespace index, and a metadata record
        (``ns`` and ``_ts``) in the metadata index under the same id.  The
        documents are mutated: ``ns``, ``_id`` and ``_ts`` are popped.

        Failed actions are logged, not raised.  An empty ``docs`` sequence is
        silently ignored.  When ``auto_commit_interval`` is ``0`` the indexes
        are committed after the bulk request.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = str(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                document_action = {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # ``doc`` is still None only when the loop never ran.  A
            # truthiness test is wrong here: the last document has had its
            # metadata popped and may legitimately be an empty dict, which
            # would raise mid-stream and then be swallowed below.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Delete ``doc`` and its metadata record from Elasticsearch.

        Two deletes are issued with the same id (``str(doc["_id"])``): one in
        the ``doc['ns']`` index, one in the metadata index.  Each requests an
        immediate refresh only when ``auto_commit_interval`` is ``0``.
        """
        refresh_now = self.auto_commit_interval == 0
        doc_id = str(doc["_id"])
        targets = (
            (doc['ns'], self.doc_type),
            (self.meta_index_name, self.meta_type),
        )
        for index, doc_type in targets:
            self.elastic.delete(index=index, doc_type=doc_type,
                                id=doc_id, refresh=refresh_now)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Yield the ``_source`` of every hit from a scrolling search.

        ``body`` is removed from ``kwargs`` and passed to ``scan`` as the
        query; the remaining keyword arguments are forwarded untouched and
        positional arguments are ignored.  Each hit's ``_id`` is copied into
        its ``_source`` before that is yielded.
        """
        query = kwargs.pop('body', None)
        hits = scan(self.elastic, query=query, scroll='10m', **kwargs)
        for hit in hits:
            source = hit['_source']
            source['_id'] = hit['_id']
            yield source

    def search(self, start_ts, end_ts):
        """Stream the metadata records whose ``_ts`` lies in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.  Both bounds are inclusive (``gte`` and
        ``lte``) and only the metadata index is queried.  The value returned
        is whatever ``self._stream_search`` returns for that query.
        """
        ts_window = {"_ts": {"gte": start_ts, "lte": end_ts}}
        range_query = {"filtered": {"filter": {"range": ts_window}}}
        return self._stream_search(index=self.meta_index_name,
                                   body={"query": range_query})

    def commit(self):
        """Refresh all Elasticsearch indexes.

        The refresh callable is passed to ``retry_until_ok`` together with
        ``index=""`` instead of being called directly.
        """
        refresh_indexes = self.elastic.indices.refresh
        retry_until_ok(refresh_indexes, index="")

    def run_auto_commit(self):
        """Refresh the indexes, then re-arm the periodic commit timer.

        The next ``Timer`` is started only while ``auto_commit_interval`` is
        neither ``None`` nor ``0``; ``stop()`` sets it to ``None``, so no
        further timers are started after that.
        """
        self.elastic.indices.refresh()
        interval = self.auto_commit_interval
        if interval in (None, 0):
            return
        Timer(interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the most recently modified metadata record, or ``None``.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback.  The metadata
        index is searched for a single hit sorted by ``_ts`` descending; the
        hit's ``_id`` is copied into its ``_source`` before that is returned.
        ``None`` is returned when there are no hits or when Elasticsearch
        rejects the request with ``RequestError``.
        """
        newest_first = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            response = self.elastic.search(
                index=self.meta_index_name, body=newest_first, size=1)
            hits = response["hits"]["hits"]
            if not hits:
                return None
            newest = hits[0]
            newest['_source']['_id'] = newest['_id']
            return newest['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

Example #27

Show file

File: kafka_doc_manager.py Project: grindthemall/kafka_doc_manager

class DocManager(DocManagerBase):
    """Kafka implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    to send to Kafka
    """
    def __init__(self, url, unique_id='_id', **kwargs):
        """Create the Kafka producer used to publish MongoDB documents.

        Args:
            url: Bootstrap server address handed to ``KafkaProducer``.
            unique_id: Stored as ``self.unique_key``.
            **kwargs: Accepted but not used by this constructor.
        """
        self.kafkaprod = KafkaProducer(
            client_id='mongotokafka-producer-mconnect',
            bootstrap_servers=[url])
        # Diagnostics go through logging, not stdout: a library constructor
        # should not print, and the producer config is dumped in full.
        # NOTE(review): the config may contain credentials - confirm that
        # debug-level logging of it is acceptable.
        logging.debug("Kafka producer created, config: %s",
                      self.kafkaprod.config)
        self.unique_key = unique_id
        self._formatter = DefaultDocumentFormatter()

    def _topic_and_mapping(self, namespace):
        """Helper method for getting the topic from a namespace."""
        topic_prefix, topic = namespace.split('.', 1)
        return topic_prefix + "_" + topic

    def stop(self):
        """Log the shutdown at INFO level and close the Kafka producer."""
        logging.info("Closing Kafka Broker")
        producer = self.kafkaprod
        producer.close()

    def apply_update(self, doc, update_spec):
        """Return the document that results from applying ``update_spec``.

        Specs containing ``$set`` or ``$unset`` are handled by the parent
        class.  Every other spec is handed back unchanged, with nothing
        copied into it from ``doc``.
        """
        if "$set" in update_spec or "$unset" in update_spec:
            return super(DocManager, self).apply_update(doc, update_spec)
        # Don't try to add ns and _ts fields back in from doc
        return update_spec

    def handle_command(self, doc, namespace, timestamp):
        """React to a MongoDB command document.

        ``dropDatabase`` and ``renameCollection`` raise
        ``errors.OperationFailed``.  ``create`` and ``drop`` only resolve the
        target through ``self.command_helper.map_collection``; no Kafka topic
        is created or deleted yet.  ``timestamp`` is not used.
        """
        db = namespace.split('.', 1)[0]

        unsupported = (
            ('dropDatabase',
             "kafka_doc_manager does not currently support deleting a topic"),
            ('renameCollection',
             "kafka_doc_manager does not support renaming topics."),
        )
        for command, message in unsupported:
            if doc.get(command):
                raise errors.OperationFailed(message)

        # TODO: create a Kafka topic when a MongoDB collection is created,
        # and delete the topic when the collection is dropped.  For now the
        # mapping is only resolved.  ``db`` is deliberately rebound, so a
        # ``drop`` in the same document sees the result of ``create``.
        for command in ('create', 'drop'):
            if doc.get(command):
                db, coll = self.command_helper.map_collection(
                    db, doc[command])

    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the document with id ``document_id`` and
        publish the result to Kafka.

        This manager never reads documents back from Kafka, so the update is
        applied to a stub holding only ``_id``.  The published message thus
        contains the id plus whatever ``apply_update`` produces from the spec.

        NOTE(review): assumes consumers accept such partial documents for
        ``$set``/``$unset`` updates - confirm.

        Returns:
            The document that was sent.
        """
        # The original body referenced ``updated`` without ever defining it,
        # so every call raised NameError.
        updated = self.apply_update({"_id": document_id}, update_spec)
        # upsert() flushes the producer itself; no separate commit needed.
        self.upsert(updated, namespace, timestamp)
        return updated

    def upsert(self, doc, namespace, timestamp):
        """Insert a document into a Kafka topic.

        The topic is derived from ``namespace``.  The document is formatted,
        serialised with ``dumps``, sent as ``str(...)`` and the producer is
        flushed straight away.  ``timestamp`` is not used.  The value
        returned by ``send()`` is not inspected here.
        """
        topic = self._topic_and_mapping(namespace)
        # Send document to Kafka with appropriate topic setting
        payload = dumps(self._formatter.format_document(doc))
        self.kafkaprod.send(topic, str(payload))
        # commit right away making sure kafka buffer is empty
        self.commit()

    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into a Kafka topic.

        Every document in ``docs`` is formatted, serialised with ``dumps``
        and handed to the producer; the producer is flushed once after the
        loop.  ``timestamp`` is not used.  The values returned by ``send()``
        are not inspected here.
        """
        # The topic depends only on the namespace, so resolve it once, not
        # once per document.  A namespace without a '.' therefore fails
        # before anything is sent, even when ``docs`` is empty.
        topic = self._topic_and_mapping(namespace)
        for doc in docs:
            payload = dumps(self._formatter.format_document(doc))
            self.kafkaprod.send(topic, str(payload))

        self.commit()

    def insert_file(self, f, namespace, timestamp):
        """Do nothing: file insertion is not implemented for Kafka."""
        return None

    def remove(self, document_id, namespace, timestamp):
        """Do nothing and return ``None``.

        Kafka does not allow deletion of random messages/offsets in a topic.
        """
        return None

    def _stream_search(self, *args, **kwargs):
        # Kafka does not allow searching for specific values in the topic
        pass

    def search(self, start_ts, end_ts):
        """Do nothing and return ``None``.

        Kafka does not allow searching of topics.
        """
        return None

    def commit(self):
        """Flush the Kafka producer.

        Kafka does not normally require commits, but the flush command can
        temporarily block to empty a buffer.
        """
        producer = self.kafkaprod
        producer.flush()

    def run_auto_commit(self):
        """Flush the Kafka producer once.

        No timer is scheduled here; the method only flushes.  Kafka does not
        normally require commits, but the flush command can temporarily block
        to empty a buffer.
        """
        producer = self.kafkaprod
        producer.flush()

    def get_last_doc(self):
        """Do nothing and return ``None``.

        While we could pull the last doc from Kafka, rollbacks in Mongo will
        replay docs into the given Kafka topic; the ``_id`` should allow
        syncing for any consumers.
        """
        return None