def __init__(self, url, **kwargs):
    """Verify URL and establish a connection."""
    try:
        client = pymongo.MongoClient(url)
    except pymongo.errors.InvalidURI:
        raise errors.ConnectionFailed("Invalid URI for MongoDB")
    except pymongo.errors.ConnectionFailure:
        raise errors.ConnectionFailed("Failed to connect to MongoDB")
    self.mongo = client
    # Optional restriction of replicated namespaces; None means "all".
    self.namespace_set = kwargs.get("namespace_set")
def __init__(self, url, **kwargs):
    """Verify URL and establish a connection."""
    client_options = kwargs.get('clientOptions', {})
    try:
        self.mongo = pymongo.MongoClient(url, **client_options)
    except pymongo.errors.InvalidURI:
        raise errors.ConnectionFailed("Invalid URI for MongoDB")
    except pymongo.errors.ConnectionFailure:
        raise errors.ConnectionFailed("Failed to connect to MongoDB")
    # Optional restriction of replicated namespaces; None means "all".
    self.namespace_set = kwargs.get("namespace_set")
    # Upper bound on documents per bulk operation.
    self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
def __init__(self, url, **kwargs):
    """Verify URL and establish a connection.

    Optional kwargs:
        username / password -- credentials to inject into the MongoDB URL.
        passwordFile -- path to a file whose stripped contents override
            ``password``.
        clientOptions -- dict of extra keyword options for
            pymongo.MongoClient.
        chunk_size, use_single_meta_collection, meta_collection_name,
        meta_collection_cap_size -- bulk-write and meta-collection tuning.

    Raises:
        errors.ConnectionFailed: if the URI is invalid or the connection
            cannot be established.
    """
    from urllib.parse import quote_plus

    username = kwargs.pop('username', '')
    password = kwargs.pop('password', '')
    passwordFile = kwargs.pop('passwordFile', '')
    if passwordFile:
        with open(passwordFile, 'r') as f:
            password = f.read().strip()
    # Ingest username and password into the mongodb url; assume the url
    # looks like 'mongodb://[username:password@]hosts/[db]'.
    if username and password:
        # Percent-escape the credentials: characters such as ':', '@',
        # '/' and '%' would otherwise corrupt the URI. pymongo requires
        # userinfo in a URI to be percent-encoded.
        url = url.replace(
            'mongodb://',
            'mongodb://{}:{}@'.format(quote_plus(username),
                                      quote_plus(password)))
    try:
        self.mongo = pymongo.MongoClient(
            url, **kwargs.get('clientOptions', {}))
    except pymongo.errors.InvalidURI:
        raise errors.ConnectionFailed("Invalid URI for MongoDB")
    except pymongo.errors.ConnectionFailure:
        raise errors.ConnectionFailed("Failed to connect to MongoDB")
    self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
    self.use_single_meta_collection = kwargs.get(
        'use_single_meta_collection', False)
    self.meta_collection_name = kwargs.get(
        'meta_collection_name', constants.DEFAULT_META_COLLECTION_NAME)
    self.meta_collection_cap_size = kwargs.get(
        'meta_collection_cap_size',
        constants.DEFAULT_META_COLLECTION_CAP_SIZE)
    # The '_id' field has to be unique, so if we will be writing data from
    # different namespaces into a single collection, we use a different
    # field for storing the document id.
    self.id_field = 'doc_id' if self.use_single_meta_collection else '_id'
    self.meta_database = self.mongo["__mongo_connector"]
    # Create the meta collection as capped if a single meta collection is
    # preferred.
    if self.use_single_meta_collection:
        if (self.meta_collection_name
                not in self.meta_database.collection_names()):
            self.meta_database.create_collection(
                self.meta_collection_name,
                capped=True,
                size=self.meta_collection_cap_size)
        meta_collection = self.meta_database[self.meta_collection_name]
        meta_collection.create_index(self.id_field)
        meta_collection.create_index([('ns', 1), ('_ts', 1)])
    # Record the namespaces which have an index on 'updateId'.
    self.nsIndexedUpdateId = set()
def __init__(self, url, **kwargs):
    """Verify URL and establish a connection."""
    try:
        self.mongo = pymongo.MongoClient(url)
    except pymongo.errors.InvalidURI:
        raise errors.ConnectionFailed("Invalid URI for MongoDB")
    except pymongo.errors.ConnectionFailure:
        raise errors.ConnectionFailed("Failed to connect to MongoDB")
    # Optional restriction of replicated namespaces; None means "all".
    self.namespace_set = kwargs.get("namespace_set")
    # Ensure every tracked namespace has a timestamp index in the
    # connector's meta database.
    meta_db = self.mongo["__mongo_connector"]
    for namespace in self._namespaces():
        meta_db[namespace].create_index("_ts")
def __init__(self, url, **kwargs):
    """Verify URL and establish a connection.

    Optional kwargs:
        clientOptions -- dict of extra keyword options for
            pymongo.MongoClient.
        namespace_set -- restriction of replicated namespaces.
        chunk_size, use_single_meta_collection, meta_collection_name,
        meta_collection_cap_size -- bulk-write and meta-collection tuning.

    Raises:
        errors.ConnectionFailed: if the URI is invalid or the connection
            cannot be established.
    """
    # NOTE: removed a commented-out duplicate of the MongoClient call
    # that previously shadowed the live line below.
    try:
        self.mongo = pymongo.MongoClient(
            url, **kwargs.get('clientOptions', {}))
    except pymongo.errors.InvalidURI:
        raise errors.ConnectionFailed("Invalid URI for MongoDB")
    except pymongo.errors.ConnectionFailure:
        raise errors.ConnectionFailed("Failed to connect to MongoDB")
    self.namespace_set = kwargs.get("namespace_set")
    self.chunk_size = kwargs.get('chunk_size', constants.DEFAULT_MAX_BULK)
    self.use_single_meta_collection = kwargs.get(
        'use_single_meta_collection', False)
    self.meta_collection_name = kwargs.get(
        'meta_collection_name', constants.DEFAULT_META_COLLECTION_NAME)
    self.meta_collection_cap_size = kwargs.get(
        'meta_collection_cap_size',
        constants.DEFAULT_META_COLLECTION_CAP_SIZE)
    # The '_id' field has to be unique, so if we will be writing data from
    # different namespaces into a single collection, we use a different
    # field for storing the document id.
    self.id_field = 'doc_id' if self.use_single_meta_collection else '_id'
    self.meta_database = self.mongo["__mongo_connector"]
    # Create the meta collection as capped if a single meta collection is
    # preferred.
    if self.use_single_meta_collection:
        if (self.meta_collection_name
                not in self.meta_database.collection_names()):
            self.meta_database.create_collection(
                self.meta_collection_name,
                capped=True,
                size=self.meta_collection_cap_size)
        meta_collection = self.meta_database[self.meta_collection_name]
        meta_collection.create_index(self.id_field)
        meta_collection.create_index([('ns', 1), ('_ts', 1)])
def _remove(self):
    """For test purposes only. Removes all documents in test.test"""
    match_all_query = {"match_all": {}}
    try:
        self.elastic.delete_by_query(index="test.test",
                                     doc_type=self.doc_type,
                                     body=match_all_query)
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed("Could not remove test documents")
    # Flush so the deletion is immediately visible.
    self.commit()
def remove(self, doc):
    """Removes documents from Elastic

    The input is a python dictionary that represents a mongo document.
    """
    doc_id = str(doc[self.unique_key])
    # Refresh immediately when auto-commit is disabled (interval 0).
    force_refresh = (self.auto_commit_interval == 0)
    try:
        self.elastic.delete(index=doc['ns'],
                            doc_type=self.doc_type,
                            id=doc_id,
                            refresh=force_refresh)
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed("Could not remove document: %s" %
                                     (bsjson.dumps(doc)))
def get_last_doc(self):
    """Returns the last document stored in the Elastic engine."""
    # Sort every index by the connector timestamp, newest first, and
    # take a single hit.
    query_body = {
        "query": {"match_all": {}},
        "sort": [{"_ts": "desc"}],
    }
    try:
        hits = self.elastic.search(index="_all", body=query_body,
                                   size=1)["hits"]["hits"]
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not retrieve last document from Elastic Search")
    if hits:
        return hits[0]["_source"]
    return None
def bulk_upsert(self, docs):
    """Update or insert multiple documents into Elastic

    docs may be any iterable
    """
    def docs_to_upsert():
        # Interleave bulk-API action lines with document bodies, the
        # format required by the Elasticsearch bulk endpoint.
        doc = None
        for doc in docs:
            # Namespace doubles as the target index name.
            index = doc["ns"]
            doc[self.unique_key] = str(doc[self.unique_key])
            doc_id = doc[self.unique_key]
            yield {
                "index": {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id
                }
            }
            yield doc
        # `doc` is still None when `docs` was empty. NOTE: this raise
        # happens inside the generator while elastic.bulk() consumes
        # it, so it surfaces through the bulk() call below.
        if not doc:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        self.elastic.bulk(doc_type=self.doc_type,
                          body=docs_to_upsert(),
                          refresh=(self.auto_commit_interval == 0))
    except (es_exceptions.ConnectionError):
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not bulk-insert documents into Elastic")
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
def _stream_search(self, *args, **kwargs):
    """Helper method for iterating over ES search results"""
    try:
        # A "scan" search returns no hits in the first response, only a
        # scroll id and the expected total.
        initial = self.elastic.search(*args, search_type="scan",
                                      scroll="10m", size=100, **kwargs)
        scroll_id = initial.get("_scroll_id")
        expected_total = initial.get("hits", {}).get("total", 0)
        yielded = 0
        # Page through the scroll until every expected hit was yielded.
        while yielded < expected_total:
            page = self.elastic.scroll(scroll_id=scroll_id, scroll="10m")
            page_hits = page["hits"]["hits"]
            yielded += len(page_hits)
            for hit in page_hits:
                yield hit["_source"]
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not retrieve documents from Elastic Search")
def upsert(self, doc):
    """Update or insert a document into Elastic

    If you'd like to have different types of document in your database,
    you can store the doc type as a field in Mongo and set doc_type to
    that field. (e.g. doc_type = doc['_type'])
    """
    # Namespace doubles as the target index; the mongo _id becomes the
    # ES document id (stringified, since ObjectId is not JSON-safe).
    target_index = doc['ns']
    doc[self.unique_key] = str(doc["_id"])
    try:
        self.elastic.index(index=target_index,
                           doc_type=self.doc_type,
                           body=bsjson.dumps(doc),
                           id=doc[self.unique_key],
                           refresh=(self.auto_commit_interval == 0))
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed("Could not index document: %s" %
                                     (bsjson.dumps(doc)))