Example 1
 def __init__(self, url, **kwargs):
     """Open a MongoDB client connection for the given connection URL.

     Raises errors.ConnectionFailed when the URL is malformed or the
     server cannot be reached.
     """
     try:
         client = pymongo.MongoClient(url)
     except pymongo.errors.InvalidURI:
         raise errors.ConnectionFailed("Invalid URI for MongoDB")
     except pymongo.errors.ConnectionFailure:
         raise errors.ConnectionFailed("Failed to connect to MongoDB")
     self.mongo = client
     # Optional collection of namespaces this manager should handle.
     self.namespace_set = kwargs.get("namespace_set")
Example 2
 def __init__(self, url, **kwargs):
     """Validate the MongoDB URL and connect.

     Recognized kwargs: 'clientOptions' (dict of extra MongoClient
     options), 'namespace_set', and 'chunk_size'.

     Raises errors.ConnectionFailed on an invalid URI or a failed
     connection attempt.
     """
     client_options = kwargs.get('clientOptions', {})
     try:
         self.mongo = pymongo.MongoClient(url, **client_options)
     except pymongo.errors.InvalidURI:
         raise errors.ConnectionFailed("Invalid URI for MongoDB")
     except pymongo.errors.ConnectionFailure:
         raise errors.ConnectionFailed("Failed to connect to MongoDB")
     self.namespace_set = kwargs.get("namespace_set")
     self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
Example 3
    def __init__(self, url, **kwargs):
        """ Verify URL and establish a connection.

        Optional kwargs: 'username'/'password' (or 'passwordFile', which
        takes precedence) for authentication, 'clientOptions' (dict of
        extra MongoClient options), 'chunk_size', and the
        single-meta-collection settings.

        Raises errors.ConnectionFailed on an invalid URI or a failed
        connection attempt.
        """
        # Python 2/3 compatible import, used to percent-encode credentials.
        try:
            from urllib.parse import quote_plus
        except ImportError:  # Python 2
            from urllib import quote_plus

        username = kwargs.pop('username', '')
        password = kwargs.pop('password', '')
        passwordFile = kwargs.pop('passwordFile', '')
        if passwordFile:
            # A password file overrides any inline password.
            with open(passwordFile, 'r') as f:
                password = f.read().strip()

        # ingest username and password into mongodb url,
        # assume mongodb url likes 'mongodb://[username:password@]hosts/[db]'
        # Percent-encode the credentials: a raw ':', '@' or '/' in the
        # password would otherwise corrupt the connection string.
        if username and password:
            url = url.replace(
                'mongodb://',
                'mongodb://{}:{}@'.format(quote_plus(username),
                                          quote_plus(password)),
                1)  # replace only the leading scheme prefix

        try:
            self.mongo = pymongo.MongoClient(url,
                                             **kwargs.get('clientOptions', {}))
        except pymongo.errors.InvalidURI:
            raise errors.ConnectionFailed("Invalid URI for MongoDB")
        except pymongo.errors.ConnectionFailure:
            raise errors.ConnectionFailed("Failed to connect to MongoDB")
        self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
        self.use_single_meta_collection = kwargs.get(
            'use_single_meta_collection', False)
        self.meta_collection_name = kwargs.get(
            'meta_collection_name', constants.DEFAULT_META_COLLECTION_NAME)
        self.meta_collection_cap_size = kwargs.get(
            'meta_collection_cap_size',
            constants.DEFAULT_META_COLLECTION_CAP_SIZE)

        # The '_id' field has to be unique, so if we will be writing data from
        # different namespaces into single collection, we use a different field
        # for storing the document id.
        self.id_field = 'doc_id' if self.use_single_meta_collection else '_id'
        self.meta_database = self.mongo["__mongo_connector"]

        # Create the meta collection as capped if a single meta collection is
        # preferred
        if self.use_single_meta_collection:
            if (self.meta_collection_name
                    not in self.meta_database.collection_names()):
                self.meta_database.create_collection(
                    self.meta_collection_name,
                    capped=True,
                    size=self.meta_collection_cap_size)
                meta_collection = self.meta_database[self.meta_collection_name]
                meta_collection.create_index(self.id_field)
                meta_collection.create_index([('ns', 1), ('_ts', 1)])

        # record the namespace which has index on 'updateId'
        self.nsIndexedUpdateId = set()
Example 4
 def __init__(self, url, **kwargs):
     """Connect to MongoDB and ensure a '_ts' index per tracked namespace.

     Raises errors.ConnectionFailed on an invalid URI or an unreachable
     server.
     """
     try:
         self.mongo = pymongo.MongoClient(url)
     except pymongo.errors.InvalidURI:
         raise errors.ConnectionFailed("Invalid URI for MongoDB")
     except pymongo.errors.ConnectionFailure:
         raise errors.ConnectionFailed("Failed to connect to MongoDB")
     self.namespace_set = kwargs.get("namespace_set")
     meta_db = self.mongo["__mongo_connector"]
     for ns in self._namespaces():
         meta_db[ns].create_index("_ts")
    def __init__(self, url, **kwargs):
        """ Verify URL and establish a connection.

        Recognized kwargs: 'clientOptions' (dict of extra MongoClient
        options), 'namespace_set', 'chunk_size', and the
        single-meta-collection settings.

        Raises errors.ConnectionFailed on an invalid URI or a failed
        connection attempt.
        """
        try:
            self.mongo = pymongo.MongoClient(url, **kwargs.get('clientOptions', {}))
        except pymongo.errors.InvalidURI:
            raise errors.ConnectionFailed("Invalid URI for MongoDB")
        except pymongo.errors.ConnectionFailure:
            raise errors.ConnectionFailed("Failed to connect to MongoDB")
        self.namespace_set = kwargs.get("namespace_set")
        self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
        self.use_single_meta_collection = kwargs.get(
            'use_single_meta_collection',
            False)
        self.meta_collection_name = kwargs.get(
            'meta_collection_name',
            constants.DEFAULT_META_COLLECTION_NAME)
        self.meta_collection_cap_size = kwargs.get(
            'meta_collection_cap_size',
            constants.DEFAULT_META_COLLECTION_CAP_SIZE)

        # The '_id' field has to be unique, so if we will be writing data from
        # different namespaces into single collection, we use a different field
        # for storing the document id.
        self.id_field = 'doc_id' if self.use_single_meta_collection else '_id'
        self.meta_database = self.mongo["__mongo_connector"]

        # Create the meta collection as capped if a single meta collection is
        # preferred
        if self.use_single_meta_collection:
            if (self.meta_collection_name not in
                    self.meta_database.collection_names()):
                self.meta_database.create_collection(
                    self.meta_collection_name,
                    capped=True,
                    size=self.meta_collection_cap_size)
                meta_collection = self.meta_database[self.meta_collection_name]
                meta_collection.create_index(self.id_field)
                meta_collection.create_index([('ns', 1), ('_ts', 1)])
 def _remove(self):
     """For test purposes only. Delete every document in test.test."""
     try:
         self.elastic.delete_by_query(
             index="test.test",
             doc_type=self.doc_type,
             body={"match_all": {}})
     except es_exceptions.ConnectionError:
         raise errors.ConnectionFailed(
             "Could not connect to Elastic Search")
     except es_exceptions.TransportError:
         raise errors.OperationFailed("Could not remove test documents")
     # Make the deletion visible immediately.
     self.commit()
    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        Its 'ns' field names the target index and its unique key names the
        document id.
        """
        # Refresh synchronously only when auto-commit is effectively "now".
        refresh_now = self.auto_commit_interval == 0
        try:
            self.elastic.delete(index=doc['ns'],
                                doc_type=self.doc_type,
                                id=str(doc[self.unique_key]),
                                refresh=refresh_now)
        except es_exceptions.ConnectionError:
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not remove document: %s" %
                                         (bsjson.dumps(doc)))
    def get_last_doc(self):
        """Return the last document stored in the Elastic engine.

        Searches every index sorted by '_ts' descending and returns the
        single top hit's source, or None when nothing is stored.
        """
        query_body = {
            "query": {"match_all": {}},
            "sort": [{"_ts": "desc"}],
        }
        try:
            hits = self.elastic.search(index="_all",
                                       body=query_body,
                                       size=1)["hits"]["hits"]
        except es_exceptions.ConnectionError:
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not retrieve last document from Elastic Search")

        if not hits:
            return None
        return hits[0]["_source"]
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic

        docs may be any iterable
        """
        def docs_to_upsert():
            # Interleave action metadata and document bodies in the shape
            # the Elasticsearch bulk API expects:
            #   {"index": {...}}, doc, {"index": {...}}, doc, ...
            doc = None
            for doc in docs:
                index = doc["ns"]
                # Normalize the unique key to a string so it can serve as
                # the Elastic document id.
                doc[self.unique_key] = str(doc[self.unique_key])
                doc_id = doc[self.unique_key]
                yield {
                    "index": {
                        "_index": index,
                        "_type": self.doc_type,
                        "_id": doc_id
                    }
                }
                yield doc
            # Fires when 'docs' yielded nothing (doc is still None) — or,
            # as a side effect of the truthiness test, when the final item
            # was falsy. The raise surfaces inside elastic.bulk(), which
            # consumes this generator, and is caught below.
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            self.elastic.bulk(doc_type=self.doc_type,
                              body=docs_to_upsert(),
                              refresh=(self.auto_commit_interval == 0))
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Elastic")
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
 def _stream_search(self, *args, **kwargs):
     """Helper method for iterating over ES search results"""
     try:
         # Open a scan/scroll cursor. With search_type="scan" (legacy ES
         # scroll API) the first response carries the scroll id and the
         # total hit count but no hits of its own — so only the scroll
         # pages below are yielded.
         first_response = self.elastic.search(*args,
                                              search_type="scan",
                                              scroll="10m",
                                              size=100,
                                              **kwargs)
         scroll_id = first_response.get("_scroll_id")
         expected_count = first_response.get("hits", {}).get("total", 0)
         results_returned = 0
         while results_returned < expected_count:
             # Each scroll call returns the next page of hits; the scroll
             # context is kept alive for 10 minutes between calls.
             next_response = self.elastic.scroll(scroll_id=scroll_id,
                                                 scroll="10m")
             results_returned += len(next_response["hits"]["hits"])
             for doc in next_response["hits"]["hits"]:
                 yield doc["_source"]
     # NOTE: the yields sit inside this try, so connection/transport
     # failures during iteration (not just on the first call) are mapped
     # to the connector's own error types.
     except (es_exceptions.ConnectionError):
         raise errors.ConnectionFailed(
             "Could not connect to Elastic Search")
     except es_exceptions.TransportError:
         raise errors.OperationFailed(
             "Could not retrieve documents from Elastic Search")
    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """
        target_index = doc['ns']
        # The Mongo '_id' becomes the Elastic document id, stored back on
        # the document under the unique key as a string.
        doc[self.unique_key] = str(doc["_id"])
        try:
            self.elastic.index(index=target_index,
                               doc_type=self.doc_type,
                               body=bsjson.dumps(doc),
                               id=doc[self.unique_key],
                               refresh=(self.auto_commit_interval == 0))
        except es_exceptions.ConnectionError:
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not index document: %s" %
                                         (bsjson.dumps(doc)))