def reconnect(self):
    """
    Reconnect to the database and rebuild indices if necessary. Users should
    typically not have to call this method.

    Rebinds ``self.database`` and ``self.collection`` from the connection
    returned by ``getDbConnection()``, then ensures every index listed in
    ``self._indices`` and, when ``self._textIndex`` is a dict, the full-text
    index it describes.
    """
    db_connection = getDbConnection()
    self.database = db_connection.get_default_database()
    self.collection = MongoProxy(self.database[self.name])

    for index in self._indices:
        # A list/tuple entry is (key specification, kwargs for ensure_index).
        if isinstance(index, (list, tuple)):
            self.collection.ensure_index(index[0], **index[1])
        else:
            self.collection.ensure_index(index)

    # isinstance (rather than an exact type() comparison) so that dict
    # subclasses such as OrderedDict also get their text index created.
    if isinstance(self._textIndex, dict):
        # The dict's keys are the indexed fields; the dict itself is passed
        # as the per-field weights.
        textIdx = [(k, 'text') for k in self._textIndex]
        try:
            self.collection.ensure_index(
                textIdx, weights=self._textIndex,
                default_language=self._textLanguage)
        except pymongo.errors.OperationFailure:
            # Best effort: the model stays usable without text search.
            print(
                TerminalColor.warning('WARNING: Text search not enabled.'))
def __init__(self):
    """
    Initialize the model's bookkeeping, let the subclass configure itself via
    ``initialize()``, then connect to the database and ensure the declared
    indices.
    """
    self.name = None
    self._indices = []
    self._textIndex = None
    self._textLanguage = None

    # Per-access-level sets of exposed field names.
    self._filterKeys = {
        AccessType.READ: set(),
        AccessType.WRITE: set(),
        AccessType.ADMIN: set(),
        AccessType.SITE_ADMIN: set()
    }

    # Must run before connecting: the collection is looked up by self.name
    # and the index lists are read below.
    self.initialize()

    db_connection = getDbConnection()
    self.database = db_connection.get_default_database()
    self.collection = MongoProxy(self.database[self.name])

    for index in self._indices:
        # A list/tuple entry is (key specification, kwargs for ensure_index).
        if isinstance(index, (list, tuple)):
            self.collection.ensure_index(index[0], **index[1])
        else:
            self.collection.ensure_index(index)

    # isinstance (rather than an exact type() comparison) so that dict
    # subclasses such as OrderedDict also get their text index created.
    if isinstance(self._textIndex, dict):
        textIdx = [(k, 'text') for k in self._textIndex]
        try:
            self.collection.ensure_index(
                textIdx, weights=self._textIndex,
                default_language=self._textLanguage)
        except pymongo.errors.OperationFailure:
            # Best effort: the model stays usable without text search.
            print(
                TerminalColor.warning('WARNING: Text search not enabled.'))
def getDbConnection(uri=None, replicaSet=None):
    """
    Get a MongoClient object that is connected to the configured database.
    We lazy-instantiate a module-level singleton, the MongoClient objects
    manage their own connection pools internally.

    :param uri: if specified, connect to this mongo db rather than the one in
                the config.
    :param replicaSet: if uri is specified, use this replica set.
    :returns: the MongoProxy-wrapped client. It is cached in ``_dbClients``
              under both the requested and the resolved (uri, replicaSet).
    """
    global _dbClients

    origKey = (uri, replicaSet)
    if origKey in _dbClients:
        return _dbClients[origKey]

    # No explicit URI: fall back to the configured database settings.
    if uri is None or uri == '':
        dbConf = getDbConfig()
        uri = dbConf.get('uri')
        replicaSet = dbConf.get('replica_set')

    clientOptions = {
        'connectTimeoutMS': 15000,
        # This is the maximum time between when we fetch data from a cursor.
        # If it times out, the cursor is lost and we can't reconnect.  If it
        # isn't set, we have issues with replica sets when the primary goes
        # down.  This value can be overridden in the mongodb uri connection
        # string with the socketTimeoutMS.
        'socketTimeoutMS': 60000,
    }

    # Build a description of the target that is safe to print, i.e. without
    # the "user:password@" prefix of the URI.
    if uri is None:
        # NOTE(review): the original literal for this branch was destroyed
        # when the file was redacted. This placeholder is only used in the
        # log message below and assumes pymongo's default host when no URI
        # is given -- confirm the intended text.
        dbUriRedacted = 'mongodb://localhost:27017'
    else:
        parts = uri.split('@')
        if len(parts) == 2:
            dbUriRedacted = 'mongodb://' + parts[1]
        else:
            dbUriRedacted = uri

    if replicaSet:
        client = pymongo.MongoReplicaSetClient(
            uri, replicaSet=replicaSet,
            read_preference=ReadPreference.SECONDARY_PREFERRED,
            **clientOptions)
    else:
        client = pymongo.MongoClient(uri, **clientOptions)
    client = MongoProxy(client, logger=logger)

    # Cache under both keys so that a later call with either the original or
    # the config-resolved arguments reuses this client.
    _dbClients[origKey] = _dbClients[(uri, replicaSet)] = client

    desc = ''
    if replicaSet:
        desc += ', replica set: %s' % replicaSet
    print(
        TerminalColor.info('Connected to MongoDB: %s%s' % (dbUriRedacted,
                                                           desc)))
    return client
def setUp(self):
    """
    Set up the mongo db for the external dataset, with 3 collections:

    a) tweetsgeo, which has tweet data that is geolocated (lat/long fields).
    b) polyGeoIndexed, w/2 polygons in a 2dsphere-indexed 'geometry' field
    c) polyGeoNonIndexed, same as above but without the 2dsphere index
    """
    super(MongoDatasetTestCase, self).setUp()

    self._user = self.model('user').createUser(
        'minervauser', 'password', 'minerva', 'user', '*****@*****.**')

    # Derive the external test database URI from the configured one by
    # replacing its last path component with the test database name.
    from girder.utility import config
    dbUri = config.getConfig()['database']['uri']
    self.dbName = 'minerva_test_external_mongo_dataset'
    dbUriParts = dbUri.split('/')[0:-1]
    self.dbUri = '/'.join(dbUriParts + [self.dbName])
    from girder.models import getDbConnection
    self.externalMongoDbConnection = getDbConnection(self.dbUri)
    self.externalMongoDb = \
        self.externalMongoDbConnection.get_default_database()
    from girder.external.mongodb_proxy import MongoProxy
    self.geojsonIndexedName = 'polyGeoIndexed'
    self.geojsonNonIndexedName = 'polyGeoNonIndexed'
    self.polyIndexedCollection = MongoProxy(
        self.externalMongoDb[self.geojsonIndexedName])
    self.polyNonIndexedCollection = MongoProxy(
        self.externalMongoDb[self.geojsonNonIndexedName])
    self.pluginTestDir = os.path.dirname(os.path.realpath(__file__))

    # Load the same polygons into both collections; only one gets the
    # 2dsphere index.
    geojsonPath = os.path.join(self.pluginTestDir, 'data', 'polygons.json')
    with open(geojsonPath) as geojsonFile:
        polys = json.load(geojsonFile)
    for poly in polys:
        self.polyIndexedCollection.save(poly)
        self.polyNonIndexedCollection.save(poly)
    self.polyIndexedCollection.create_index([('geometry', '2dsphere')])

    self.collectionName = 'tweetsgeo'
    self.tweetsgeoCollection = MongoProxy(
        self.externalMongoDb[self.collectionName])

    # add test data to external dataset
    tweets100Path = os.path.join(self.pluginTestDir, 'data',
                                 'tweets100.json')
    # Context managers so the archive and its member are closed even if
    # parsing fails.
    with zipfile.ZipFile('%s.zip' % tweets100Path) as z:
        with z.open('tweets100.json') as tweetsFile:
            tweets = json.load(tweetsFile)

    from datetime import datetime
    dateformat = '%Y-%m-%dT%H:%M:%S'
    for tweet in tweets:
        # Store created_at as whole seconds since 1970-01-01.
        d = datetime.strptime((tweet['created_at']), dateformat)
        tweet['created_at'] = int((d - datetime(1970, 1, 1)).total_seconds())
        self.tweetsgeoCollection.save(tweet)
def mongoCollection(self, connectionUri, collectionName):
    """
    Return a MongoProxy-wrapped handle on a collection of an external db.

    :param connectionUri: mongo URI passed to ``getDbConnection``; the
        collection is looked up in that connection's default database.
    :param collectionName: name of the collection to wrap.
    :returns: ``MongoProxy`` around the requested collection.
    """
    # TODO not sure if this is a good idea to do this db stuff here
    # maybe this suggests a new model?
    from girder.models import getDbConnection
    database = getDbConnection(connectionUri).get_default_database()

    from girder.external.mongodb_proxy import MongoProxy
    return MongoProxy(database[collectionName])
def __init__(self, assetstore):
    """
    Connect to the chunk collection backing this assetstore.

    On a connection or configuration failure the error is logged,
    ``self.chunkColl`` is set to a placeholder string and
    ``self.unavailable`` is set to True instead of raising.

    :param assetstore: The assetstore to act on.
    """
    super(GridFsAssetstoreAdapter, self).__init__(assetstore)

    seenRecently = False
    try:
        # Building or looking up the key raises TypeError when one of the
        # connection arguments is unhashable; caching is then skipped.
        cacheKey = (self.assetstore.get('mongohost'),
                    self.assetstore.get('replicaset'),
                    self.assetstore.get('shard'))
        cached = _recentConnections.get(cacheKey)
        if cached is not None:
            age = time.time() - cached['created']
            seenRecently = age < RECENT_CONNECTION_CACHE_TIME
    except TypeError:
        cacheKey = None

    try:
        # The client pools its connections, so the only thing worth avoiding
        # for a recently seen assetstore is repeating the index setup below.
        conn = getDbConnection(self.assetstore.get('mongohost'),
                               self.assetstore.get('replicaset'),
                               quiet=seenRecently)
        self.chunkColl = MongoProxy(conn[self.assetstore['db']].chunk)
        if seenRecently:
            return

        _ensureChunkIndices(self.chunkColl)
        if self.assetstore.get('shard') == 'auto':
            _setupSharding(self.chunkColl)
        if cacheKey is not None:
            # Crude bound on the cache: empty it entirely once it is full.
            if len(_recentConnections) >= RECENT_CONNECTION_CACHE_MAX_SIZE:
                _recentConnections.clear()
            _recentConnections[cacheKey] = {'created': time.time()}
    except pymongo.errors.ConnectionFailure:
        logger.error('Failed to connect to GridFS assetstore %s',
                     self.assetstore['db'])
        self.chunkColl = 'Failed to connect'
        self.unavailable = True
    except pymongo.errors.ConfigurationError:
        logger.exception('Failed to configure GridFS assetstore %s',
                         self.assetstore['db'])
        self.chunkColl = 'Failed to configure'
        self.unavailable = True
def setUp(self):
    """
    Set up the mongo db for the external dataset, with a collection
    named tweetsgeo, which have tweet data that is geolocated, then create
    a dataset folder for the test user.
    """
    super(ExternalMongoDatasetTestCase, self).setUp()

    self._user = self.model('user').createUser(
        'minervauser', 'password', 'minerva', 'user', '*****@*****.**')

    # Derive the external test database URI from the configured one by
    # replacing its last path component with the test database name.
    from girder.utility import config
    dbUri = config.getConfig()['database']['uri']
    self.dbName = 'minerva_test_external_mongo_dataset'
    dbUriParts = dbUri.split('/')[0:-1]
    self.dbUri = '/'.join(dbUriParts + [self.dbName])
    from girder.models import getDbConnection
    self.externalMongoDbConnection = getDbConnection(self.dbUri)
    self.externalMongoDb = \
        self.externalMongoDbConnection.get_default_database()
    from girder.external.mongodb_proxy import MongoProxy
    self.collectionName = 'tweetsgeo'
    self.tweetsgeoCollection = MongoProxy(
        self.externalMongoDb[self.collectionName])

    # add test data to external dataset
    self.pluginTestDir = os.path.dirname(os.path.realpath(__file__))
    tweets100Path = os.path.join(self.pluginTestDir, 'data',
                                 'tweets100.json')
    # Context managers so the archive and its member are closed even if
    # parsing fails.
    with zipfile.ZipFile('%s.zip' % tweets100Path) as z:
        with z.open('tweets100.json') as tweetsFile:
            tweets = json.load(tweetsFile)

    from datetime import datetime
    dateformat = '%Y-%m-%dT%H:%M:%S'
    for tweet in tweets:
        # Store created_at as whole seconds since 1970-01-01.
        d = datetime.strptime((tweet['created_at']), dateformat)
        tweet['created_at'] = int((d - datetime(1970, 1, 1)).total_seconds())
        self.tweetsgeoCollection.save(tweet)

    path = '/minerva_dataset/folder'
    params = {
        'userId': self._user['_id'],
    }
    # create a dataset folder
    self.request(path=path, method='POST', params=params, user=self._user)
def reconnect(self):
    """
    Re-establish the database handles and (re)create the declared indices.

    This is normally invoked for you; callers rarely need it directly.
    """
    self.database = getDbConnection().get_default_database()
    self.collection = MongoProxy(self.database[self.name])

    for spec in self._indices:
        if isinstance(spec, (list, tuple)):
            # (key specification, keyword options) pair
            self.collection.create_index(spec[0], **spec[1])
        else:
            self.collection.create_index(spec)

    # Nothing more to do unless a text index was declared as a dict.
    if not isinstance(self._textIndex, dict):
        return

    textFields = [
        (field, 'text') for field in six.viewkeys(self._textIndex)
    ]
    try:
        self.collection.create_index(
            textFields,
            weights=self._textIndex,
            default_language=self._textLanguage)
    except pymongo.errors.OperationFailure:
        # The model stays usable without text search.
        logprint.warning('WARNING: Text search not enabled.')
class Model(ModelImporter):
    """
    Model base class. Models are responsible for abstracting away the
    persistence layer. Each collection in the database should have its own
    model. Methods that deal with database interaction belong in the
    model layer.
    """

    def __init__(self):
        """
        Initialize bookkeeping, let the subclass configure itself through
        ``initialize()``, then connect and ensure indices via ``reconnect()``.
        """
        self.name = None
        self._indices = []
        self._textIndex = None
        self._textLanguage = None

        # Per-access-level sets of exposed field names.
        self._filterKeys = {
            AccessType.READ: set(),
            AccessType.WRITE: set(),
            AccessType.ADMIN: set(),
            AccessType.SITE_ADMIN: set()
        }

        # Must run before reconnect(): it sets self.name and declares indices.
        self.initialize()
        self.reconnect()

    def reconnect(self):
        """
        Reconnect to the database and rebuild indices if necessary. Users
        should typically not have to call this method.
        """
        db_connection = getDbConnection()
        self.database = db_connection.get_default_database()
        self.collection = MongoProxy(self.database[self.name])

        for index in self._indices:
            # A list/tuple entry is (key specification, ensure_index kwargs).
            if isinstance(index, (list, tuple)):
                self.collection.ensure_index(index[0], **index[1])
            else:
                self.collection.ensure_index(index)

        # isinstance (rather than an exact type() comparison) so that dict
        # subclasses such as OrderedDict also get their text index created.
        if isinstance(self._textIndex, dict):
            textIdx = [(k, 'text') for k in self._textIndex]
            try:
                self.collection.ensure_index(
                    textIdx, weights=self._textIndex,
                    default_language=self._textLanguage)
            except pymongo.errors.OperationFailure:
                # Best effort: the model stays usable without text search.
                print(
                    TerminalColor.warning('WARNING: Text search not enabled.'))

    def exposeFields(self, level, fields):
        """
        Expose model fields to users with the given access level. Subclasses
        should call this in their initialize method to declare what fields
        should be exposed to what access levels if they are using the default
        filter implementation in this class. Since filtered fields are sets,
        this method is idempotent.

        :param level: The required access level for the field.
        :type level: AccessType
        :param fields: A field or list of fields to expose for that level.
        :type fields: str, list, or tuple
        """
        # A bare string would otherwise be treated as a sequence of characters.
        if isinstance(fields, six.string_types):
            fields = (fields, )

        self._filterKeys[level] = self._filterKeys[level].union(fields)

    def hideFields(self, level, fields):
        """
        Hide a field, i.e. make sure it is not exposed via the default
        filtering method. Since the filter uses a white list, it is only ever
        necessary to call this for fields that were added previously with
        exposeFields().

        :param level: The access level to remove the fields from.
        :type level: AccessType
        :param fields: The field or fields to remove from the white list.
        :type fields: str, list, or tuple
        """
        if isinstance(fields, six.string_types):
            fields = (fields, )

        self._filterKeys[level] = self._filterKeys[level].difference(fields)

    def filter(self, doc, user=None, additionalKeys=None):
        """
        Filter this model for the given user. This is a default implementation
        that assumes this model has no notion of access control, and simply
        allows all keys under READ access level, and conditionally allows any
        keys assigned to SITE_ADMIN level.

        :param doc: The document of this model type to be filtered.
        :type doc: dict or None
        :param user: The current user for whom we are filtering.
        :type user: dict or None
        :param additionalKeys: Any additional keys that should be included in
            the document for this call only.
        :type additionalKeys: list, tuple, or None
        :returns: The filtered document (dict), or None if doc is None.
        """
        if doc is None:
            return None

        keys = self._filterKeys[AccessType.READ]

        # Only an explicit True admin flag unlocks the SITE_ADMIN fields.
        if user and user.get('admin') is True:
            keys = keys.union(self._filterKeys[AccessType.SITE_ADMIN])

        if additionalKeys:
            keys = keys.union(additionalKeys)

        return self.filterDocument(doc, allow=tuple(keys))

    def ensureTextIndex(self, index, language='english'):
        """
        Call this during initialize() of the subclass if you want your
        model to have a full-text searchable index. Each collection may
        have zero or one full-text index.

        :param index: Mapping of field name to weight. Its keys become the
            text-indexed fields and the mapping itself is passed as the
            index weights.
        :type index: dict
        :param language: The default_language value for the text index,
            which is used for stemming and stop words. If the text index
            should not use stemming and stop words, set this param to 'none'.
        :type language: str
        """
        self._textIndex = index
        self._textLanguage = language

    def ensureIndices(self, indices):
        """
        Subclasses should call this with a list of strings representing
        fields that should be indexed in the database if there are any.
        Otherwise, it is not necessary to call this method. Elements of the
        list may also be a list or tuple, where the second element is a
        dictionary that will be passed as kwargs to the pymongo ensure_index
        call.
        """
        self._indices.extend(indices)

    def ensureIndex(self, index):
        """
        Like ensureIndices, but declares just a single index rather than a list
        of them.
        """
        self._indices.append(index)

    def validate(self, doc):
        """
        Models should implement this to validate the document before it enters
        the database. It must return the document with any necessary filters
        applied, or throw a ValidationException if validation of the document
        fails.

        :param doc: The document to validate before saving to the collection.
        :type doc: dict
        """
        raise Exception('Must override validate() in %s model.'
                        % self.__class__.__name__)  # pragma: no cover

    def initialize(self):
        """
        Subclasses should override this and set the name of the collection as
        self.name. Also, they should set any indexed fields that they require.
        """
        raise Exception('Must override initialize() in %s model'
                        % self.__class__.__name__)  # pragma: no cover

    def find(self, query=None, offset=0, limit=0, **kwargs):
        """
        Search the collection by a set of parameters. Passes any kwargs
        through to the underlying pymongo.collection.find function.

        :param query: The search query (see general MongoDB docs for "find()")
        :type query: dict
        :param offset: The offset into the results
        :type offset: int
        :param limit: Maximum number of documents to return
        :type limit: int
        :param sort: The sort order.
        :type sort: List of (key, order) tuples.
        :param fields: A mask for filtering result documents by key.
        :type fields: List of strings
        :returns: A pymongo database cursor.
        """
        if not query:
            query = {}

        # Cursors do not time out unless the caller explicitly asks for it.
        kwargs.setdefault('timeout', False)

        return self.collection.find(
            spec=query, skip=offset, limit=limit, **kwargs)

    def findOne(self, query=None, **kwargs):
        """
        Search the collection by a set of parameters. Passes any kwargs
        through to the underlying pymongo.collection.find_one function.

        :param query: The search query (see general MongoDB docs for "find()")
        :type query: dict
        :param sort: The sort order.
        :type sort: List of (key, order) tuples.
        :param fields: A mask for filtering result documents by key.
        :type fields: List of strings
        :returns: the first object that was found, or None if none found.
        """
        if not query:
            query = {}
        return self.collection.find_one(query, **kwargs)

    def textSearch(self, query, offset=0, limit=0, sort=None, fields=None,
                   filters=None):
        """
        Perform a full-text search against the text index for this collection.

        The ``filters`` and ``fields`` arguments are never modified; the
        search operators are added to private copies.

        :param query: The text query. Will be stemmed internally.
        :type query: str
        :param offset: The offset into the results.
        :type offset: int
        :param limit: Maximum number of documents to return.
        :type limit: int
        :param sort: The sort order; when None, results may be sorted by
            text score (see below).
        :type sort: List of (key, order) tuples, or None.
        :param fields: A mask for filtering result documents by key.
        :type fields: dict, list of strings, or None
        :param filters: Any additional query operators to apply.
        :type filters: dict
        :returns: A pymongo cursor. It is left to the caller to build the
            results from the cursor.
        """
        # Copy so the '$text' operator does not leak into the caller's dict.
        filters = dict(filters) if filters else {}

        if not fields:
            fields = {}
        elif isinstance(fields, dict):
            fields = dict(fields)
        else:
            # A list of field names cannot hold the $meta projection; use the
            # equivalent inclusion dict instead.
            fields = dict((name, 1) for name in fields)

        fields['_textScore'] = {'$meta': 'textScore'}
        filters['$text'] = {'$search': query}

        cursor = self.find(filters, offset=offset, limit=limit,
                           sort=sort, fields=fields)

        # Sort by meta text score, but only if result count is below a certain
        # threshold. The text score is not a real index, so we cannot always
        # sort by it if there is a high number of matching documents.
        if cursor.count() < TEXT_SCORE_SORT_MAX and sort is None:
            cursor.sort([('_textScore', {'$meta': 'textScore'})])

        return cursor

    def save(self, document, validate=True, triggerEvents=True):
        """
        Create or update a document in the collection. This triggers two
        events; one prior to validation, and one prior to saving. Either of
        these events may have their default action prevented.

        :param document: The document to save.
        :type document: dict
        :param validate: Whether to call the model's validate() before saving.
        :type validate: bool
        :param triggerEvents: Whether to trigger events for validate and
            pre- and post-save hooks.
        :type triggerEvents: bool
        :returns: The document, with ``_id`` set if it was saved. If the
            pre-save event's default was prevented, the document is returned
            without being written.
        """
        if validate and triggerEvents:
            event = events.trigger('.'.join(('model', self.name, 'validate')),
                                   document)
            # A handler that prevents the default takes over validation.
            if event.defaultPrevented:
                validate = False

        if validate:
            document = self.validate(document)

        if triggerEvents:
            event = events.trigger('model.{}.save'.format(self.name),
                                   document)
            if event.defaultPrevented:
                return document

        # Decide before saving: afterwards '_id' is always present.
        sendCreateEvent = ('_id' not in document)
        document['_id'] = self.collection.save(document)

        if triggerEvents:
            if sendCreateEvent:
                events.trigger('model.{}.save.created'.format(self.name),
                               document)
            events.trigger('model.{}.save.after'.format(self.name), document)

        return document

    def update(self, query, update, multi=True):
        """
        This method should be used for updating multiple documents in the
        collection. This is useful for things like removing all references in
        this collection to a document that is being deleted from another
        collection.

        This is a thin wrapper around pymongo db.collection.update().

        For updating a single document, use the save() model method instead.

        :param query: The query for finding documents to update. It's
            the same format as would be passed to find().
        :type query: dict
        :param update: The update specifier.
        :type update: dict
        :param multi: Passed through as the ``multi`` argument of the
            underlying update call.
        :type multi: bool
        """
        self.collection.update(query, update, multi=multi)

    def increment(self, query, field, amount, **kwargs):
        """
        This is a specialization of the update method that atomically
        increments a field by a given amount. Additional kwargs are passed
        directly through to update.

        :param query: The query selector for documents to update.
        :type query: dict
        :param field: The name of the field in the document to increment.
        :type field: str
        :param amount: The amount to increment the field by.
        :type amount: int or float
        """
        self.update(query=query, update={'$inc': {field: amount}}, **kwargs)

    def remove(self, document, **kwargs):
        """
        Delete an object from the collection; must have its _id set.

        Two events are triggered first; if either has its default prevented,
        nothing is removed and None is returned.

        :param document: the item to remove.
        :type document: dict
        :returns: The result of the underlying remove call, or None if an
            event handler prevented the removal.
        """
        assert '_id' in document

        event = events.trigger('.'.join(('model', self.name, 'remove')),
                               document)
        kwargsEvent = events.trigger(
            '.'.join(('model', self.name, 'remove_with_kwargs')), {
                'document': document,
                'kwargs': kwargs
            })

        if not event.defaultPrevented and not kwargsEvent.defaultPrevented:
            return self.collection.remove({'_id': document['_id']})

    def removeWithQuery(self, query):
        """
        Remove all documents matching a given query from the collection.
        For safety reasons, you may not pass an empty query.

        :param query: The (non-empty) query selecting documents to remove.
        :type query: dict
        """
        assert query

        return self.collection.remove(query)

    def load(self, id, objectId=True, fields=None, exc=False):
        """
        Fetch a single object from the database using its _id field.

        :param id: The value for searching the _id field.
        :type id: string or ObjectId
        :param objectId: Whether the id should be coerced to ObjectId type.
        :type objectId: bool
        :param fields: Fields list to include. Also can be a dict for
            exclusion. See pymongo docs for how to use this arg.
        :param exc: Whether to raise a ValidationException if there is no
            document with the given id.
        :type exc: bool
        :returns: The matching document, or None.
        :raises ValidationException: if the id cannot be coerced to an
            ObjectId, or if exc is True and no document matches.
        """
        if not id:
            raise Exception('Attempt to load null ObjectId: %s' % id)

        if objectId and not isinstance(id, ObjectId):
            try:
                id = ObjectId(id)
            except Exception:
                raise ValidationException('Invalid ObjectId: {}'.format(id),
                                          field='id')
        doc = self.findOne({'_id': id}, fields=fields)

        if doc is None and exc is True:
            raise ValidationException('No such {}: {}'.format(self.name, id),
                                      field='id')

        return doc

    def filterDocument(self, doc, allow=None):
        """
        This method will filter the given document to make it suitable to
        output to the user.

        :param doc: The document to filter.
        :type doc: dict
        :param allow: The whitelist of fields to allow in the output document.
        :type allow: List of strings
        :returns: A new dict holding the allowed fields present in doc, plus
            ``_textScore`` when doc has one and ``_modelType`` set to this
            model's name; None if doc is None.
        """
        if not allow:
            allow = []

        if doc is None:
            return None

        out = {}
        for field in allow:
            if field in doc:
                out[field] = doc[field]

        # The text score is always passed through for text search results.
        if '_textScore' in doc:
            out['_textScore'] = doc['_textScore']

        out['_modelType'] = self.name

        return out

    def subtreeCount(self, doc):
        """
        Return the size of the subtree rooted at the given document. In
        general, if this contains items or folders, it will be the count of
        the items and folders in all containers. If it does not, it will be
        1. This returns the absolute size of the subtree, it does not filter
        by permissions.

        :param doc: The root of the subtree.
        :type doc: dict
        """
        return 1
class GridFsAssetstoreAdapter(AbstractAssetstoreAdapter):
    """
    This assetstore type stores files within MongoDB using the GridFS data
    model.

    File contents are stored as documents in a ``chunk`` collection; each
    chunk document has the keys ``uuid`` (shared by all chunks of one
    upload), ``n`` (the chunk's sequence number, starting at 0) and ``data``.
    """

    @staticmethod
    def validateInfo(doc):
        """
        Validate the assetstore -- make sure we can connect to it and that the
        necessary indexes are set up.

        :param doc: The assetstore document; reads the ``db``, ``mongohost``
            and ``replicaset`` keys.
        :returns: The same document.
        :raises ValidationException: if the database name is empty or contains
            a space or period, or if server selection times out.
        """
        if not doc.get('db', ''):
            raise ValidationException('Database name must not be empty.', 'db')
        if '.' in doc['db'] or ' ' in doc['db']:
            raise ValidationException(
                'Database name cannot contain spaces'
                ' or periods.', 'db')

        try:
            chunkColl = getDbConnection(
                doc.get('mongohost'),
                doc.get('replicaset'),
                autoRetry=False,
                serverSelectionTimeoutMS=10000)[doc['db']].chunk
            _ensureChunkIndices(chunkColl)
        except pymongo.errors.ServerSelectionTimeoutError as e:
            raise ValidationException('Could not connect to the database: %s' %
                                      str(e))

        return doc

    @staticmethod
    def fileIndexFields():
        """
        Return the names of the two file fields this adapter lists as index
        fields.
        """
        return ['sha512', 'chunkUuid']

    def __init__(self, assetstore):
        """
        Connect to the chunk collection of the assetstore's database.

        On ``ConnectionFailure`` or ``ConfigurationError`` the error is
        logged, ``self.chunkColl`` is set to a placeholder string and
        ``self.unavailable`` is set to True; no exception is raised.

        :param assetstore: The assetstore to act on.
        """
        super(GridFsAssetstoreAdapter, self).__init__(assetstore)
        recent = False
        try:
            # Guard in case the connectionArgs is unhashable
            key = (self.assetstore.get('mongohost'),
                   self.assetstore.get('replicaset'),
                   self.assetstore.get('shard'))
            if key in _recentConnections:
                recent = (time.time() - _recentConnections[key]['created'] <
                          RECENT_CONNECTION_CACHE_TIME)
        except TypeError:
            key = None
        try:
            # MongoClient automatically reuses connections from a pool, but we
            # want to avoid redoing ensureChunkIndices each time we get such a
            # connection.
            client = getDbConnection(self.assetstore.get('mongohost'),
                                     self.assetstore.get('replicaset'),
                                     quiet=recent)
            self.chunkColl = MongoProxy(client[self.assetstore['db']].chunk)
            if not recent:
                _ensureChunkIndices(self.chunkColl)
                if self.assetstore.get('shard') == 'auto':
                    _setupSharding(self.chunkColl)
                if key is not None:
                    # The cache is bounded by clearing it entirely when full.
                    if len(_recentConnections
                           ) >= RECENT_CONNECTION_CACHE_MAX_SIZE:
                        _recentConnections.clear()
                    _recentConnections[key] = {'created': time.time()}
        except pymongo.errors.ConnectionFailure:
            logger.error('Failed to connect to GridFS assetstore %s',
                         self.assetstore['db'])
            self.chunkColl = 'Failed to connect'
            self.unavailable = True
        except pymongo.errors.ConfigurationError:
            logger.exception('Failed to configure GridFS assetstore %s',
                             self.assetstore['db'])
            self.chunkColl = 'Failed to configure'
            self.unavailable = True

    def initUpload(self, upload):
        """
        Creates a UUID that will be used to uniquely link each chunk to this
        upload, and stores the serialized state of a fresh SHA-512 checksum.

        :param upload: The upload document; ``chunkUuid`` and ``sha512state``
            are set on it.
        :returns: The same upload document.
        """
        upload['chunkUuid'] = uuid.uuid4().hex
        upload['sha512state'] = _hash_state.serializeHex(sha512())
        return upload

    def uploadChunk(self, upload, chunk):
        """
        Stores the uploaded chunk in fixed-sized pieces in the chunks
        collection of this assetstore's database.

        :param upload: The upload document; reads ``chunkUuid``,
            ``sha512state``, ``received`` and ``size``, and updates
            ``sha512state`` and ``received``.
        :param chunk: The data to store: text (encoded as UTF-8), bytes, or
            an object with ``read`` and ``close`` methods.
        :returns: The updated upload document.
        :raises ValidationException: re-raised from ``checkUploadSize``; the
            pieces inserted by this call are deleted first.
        """
        # If we know the chunk size is too large or small, fail early.
        self.checkUploadSize(upload, self.getChunkSize(chunk))

        if isinstance(chunk, six.text_type):
            chunk = chunk.encode('utf8')

        if isinstance(chunk, six.binary_type):
            chunk = BytesIO(chunk)

        # Restore the internal state of the streaming SHA-512 checksum
        checksum = _hash_state.restoreHex(upload['sha512state'], 'sha512')

        # TODO: when saving uploads is optional, we can conditionally try to
        # fetch the last chunk.  Add these lines before `lastChunk = ...`:
        #   lastChunk = None
        #   if '_id' in upload or upload['received'] != 0:
        lastChunk = self.chunkColl.find_one({'uuid': upload['chunkUuid']},
                                            projection=['n'],
                                            sort=[('n', pymongo.DESCENDING)])

        if lastChunk:
            # This bit of code will only do anything if there is a discrepancy
            # between the received count of the upload record and the length of
            # the file stored as chunks in the database. This code updates the
            # sha512 state with the difference before reading the bytes sent
            # from the user.
            if self.requestOffset(upload) > upload['received']:
                # This isn't right -- the last received amount may not be a
                # complete chunk.
                cursor = self.chunkColl.find(
                    {
                        'uuid': upload['chunkUuid'],
                        'n': {
                            '$gte': upload['received'] // CHUNK_SIZE
                        }
                    },
                    projection=['data']).sort('n', pymongo.ASCENDING)
                for result in cursor:
                    checksum.update(result['data'])

        # Continue numbering after the last stored chunk, or start at 0.
        n = lastChunk['n'] + 1 if lastChunk else 0

        size = 0
        # Remembered so that a failed size check can roll back this call.
        startingN = n

        while upload['received'] + size < upload['size']:
            data = chunk.read(CHUNK_SIZE)
            if not data:
                break
            # If a timeout occurs while we are trying to load data, we might
            # have succeeded, in which case we will get a DuplicateKeyError
            # when it automatically retries.  Therefore, log this error but
            # don't stop.
            try:
                self.chunkColl.insert_one({
                    'n': n,
                    'uuid': upload['chunkUuid'],
                    'data': bson.binary.Binary(data)
                })
            except pymongo.errors.DuplicateKeyError:
                logger.info(
                    'Received a DuplicateKeyError while uploading, '
                    'probably because we reconnected to the database '
                    '(chunk uuid %s part %d)', upload['chunkUuid'], n)
            n += 1
            size += len(data)
            checksum.update(data)
        chunk.close()

        try:
            self.checkUploadSize(upload, size)
        except ValidationException:
            # The user tried to upload too much or too little.  Delete
            # everything we added
            self.chunkColl.delete_many({
                'uuid': upload['chunkUuid'],
                'n': {
                    '$gte': startingN
                }
            })
            raise

        # Persist the internal state of the checksum
        upload['sha512state'] = _hash_state.serializeHex(checksum)
        upload['received'] += size
        return upload

    def requestOffset(self, upload):
        """
        The offset will be the CHUNK_SIZE * total number of chunks in the
        database for this file. We return the max of that and the received
        count because in testing mode we are uploading chunks that are smaller
        than the CHUNK_SIZE, which in practice will not work.

        :param upload: The upload document; reads ``chunkUuid`` and
            ``received``.
        :returns: The larger of the computed offset and ``upload['received']``.
        """
        lastChunk = self.chunkColl.find_one({'uuid': upload['chunkUuid']},
                                            projection=['n'],
                                            sort=[('n', pymongo.DESCENDING)])

        if lastChunk is None:
            offset = 0
        else:
            # NOTE(review): 'n' is the highest chunk number and numbering
            # starts at 0 in uploadChunk, so this is n * CHUNK_SIZE rather
            # than (number of chunks) * CHUNK_SIZE -- confirm this is the
            # intended offset.
            offset = lastChunk['n'] * CHUNK_SIZE

        return max(offset, upload['received'])

    def finalizeUpload(self, upload, file):
        """
        Grab the final state of the checksum and set it on the file object,
        and write the generated UUID into the file itself.

        :param upload: The upload document; reads ``sha512state`` and
            ``chunkUuid``.
        :param file: The file document; ``sha512``, ``chunkUuid`` and
            ``chunkSize`` are set on it.
        :returns: The same file document.
        """
        hash = _hash_state.restoreHex(upload['sha512state'],
                                      'sha512').hexdigest()

        file['sha512'] = hash
        file['chunkUuid'] = upload['chunkUuid']
        file['chunkSize'] = CHUNK_SIZE

        return file

    def downloadFile(self,
                     file,
                     offset=0,
                     headers=True,
                     endByte=None,
                     contentDisposition=None,
                     extraParameters=None,
                     **kwargs):
        """
        Returns a generator function that will be used to stream the file from
        the database to the response.

        :param file: The file document; reads ``size``, ``chunkSize`` and
            ``chunkUuid``.
        :param offset: Byte offset at which to start.
        :param headers: Whether to set the response headers.
        :param endByte: Byte offset at which to stop; None or a value past
            the end of the file means the file size.
        :param contentDisposition: Passed to ``setContentHeaders``.
        :param extraParameters: Not used by this method.
        :returns: A callable. For an empty range it returns ``''``; otherwise
            calling it yields the stored data piece by piece.
        """
        if endByte is None or endByte > file['size']:
            endByte = file['size']

        if headers:
            setResponseHeader('Accept-Ranges', 'bytes')
            self.setContentHeaders(file, offset, endByte, contentDisposition)

        # If the file is empty, we stop here
        if endByte - offset <= 0:
            return lambda: ''

        n = 0
        chunkOffset = 0

        # We must "seek" to the correct chunk index and local offset
        if offset > 0:
            n = offset // file['chunkSize']
            chunkOffset = offset % file['chunkSize']

        cursor = self.chunkColl.find(
            {
                'uuid': file['chunkUuid'],
                'n': {
                    '$gte': n
                }
            }, projection=['data']).sort('n', pymongo.ASCENDING)

        def stream():
            co = chunkOffset  # Can't assign to outer scope without "nonlocal"
            position = offset
            shouldBreak = False

            for chunk in cursor:
                chunkLen = len(chunk['data'])

                # This chunk reaches past endByte: trim it and stop after it.
                if position + chunkLen - co > endByte:
                    chunkLen = endByte - position + co
                    shouldBreak = True

                yield chunk['data'][co:chunkLen]

                if shouldBreak:
                    break

                position += chunkLen - co

                # The local offset only applies to the first chunk.
                if co > 0:
                    co = 0

        return stream

    def deleteFile(self, file):
        """
        Delete all of the chunks in the collection that correspond to the
        given file.

        The chunks are only deleted when exactly one file document matches
        this file's ``chunkUuid`` within this assetstore.

        :param file: The file document; reads ``chunkUuid``.
        """
        q = {
            'chunkUuid': file['chunkUuid'],
            'assetstoreId': self.assetstore['_id']
        }
        # NOTE(review): count(True) presumably honours the limit of 2 set on
        # the query -- confirm against the pymongo cursor documentation.
        matching = File().find(q, limit=2, projection=[])
        if matching.count(True) == 1:
            # If we can't reach the database, we return anyway.  A system check
            # will be necessary to remove the abandoned file.  Since we already
            # can handle that case, tell Mongo to use a 0 write concern -- we
            # don't need to know that the chunks have been deleted, and this
            # can be faster.
            try:
                self.chunkColl.with_options(write_concern=pymongo.WriteConcern(
                    w=0)).delete_many({'uuid': file['chunkUuid']})
            except pymongo.errors.AutoReconnect:
                pass

    def cancelUpload(self, upload):
        """
        Delete all of the chunks associated with a given upload.

        :param upload: The upload document; reads ``chunkUuid``.
        """
        self.chunkColl.delete_many({'uuid': upload['chunkUuid']})
class GridFsAssetstoreAdapter(AbstractAssetstoreAdapter):
    """
    This assetstore type stores files within MongoDB using the GridFS data
    model.

    File contents are stored as documents in a ``chunk`` collection of the
    assetstore's database. Each chunk document carries the ``uuid`` of the
    upload/file it belongs to, its sequence number ``n`` and the raw ``data``.
    """

    @staticmethod
    def validateInfo(doc):
        """
        Validate the assetstore -- make sure we can connect to it and that the
        necessary indexes are set up.

        :param doc: The assetstore document to validate.
        :returns: The same document.
        :raises ValidationException: if the database name is empty or contains
            a space or period, or if server selection times out.
        """
        if not doc.get('db', ''):
            raise ValidationException('Database name must not be empty.', 'db')
        if '.' in doc['db'] or ' ' in doc['db']:
            raise ValidationException('Database name cannot contain spaces'
                                      ' or periods.', 'db')

        try:
            # autoRetry is disabled and a short server selection timeout is
            # used because this is only a connectivity check.
            chunkColl = getDbConnection(
                doc.get('mongohost'), doc.get('replicaset'),
                autoRetry=False,
                serverSelectionTimeoutMS=10000)[doc['db']].chunk
            _ensureChunkIndices(chunkColl)
        except pymongo.errors.ServerSelectionTimeoutError as e:
            raise ValidationException(
                'Could not connect to the database: %s' % str(e))

        return doc

    @staticmethod
    def fileIndexFields():
        """
        Return the names of the file fields this adapter looks files up by.
        """
        return ['sha512', 'chunkUuid']

    def __init__(self, assetstore):
        """
        Connect to the chunk collection of the given assetstore. On connection
        or configuration failure the adapter is flagged as unavailable instead
        of raising.

        :param assetstore: The assetstore to act on.
        """
        super(GridFsAssetstoreAdapter, self).__init__(assetstore)
        recent = False
        try:
            # Guard in case the connectionArgs is unhashable
            key = (self.assetstore.get('mongohost'),
                   self.assetstore.get('replicaset'),
                   self.assetstore.get('shard'))
            if key in _recentConnections:
                recent = (time.time() - _recentConnections[key]['created'] <
                          RECENT_CONNECTION_CACHE_TIME)
        except TypeError:
            key = None
        try:
            # MongoClient automatically reuses connections from a pool, but we
            # want to avoid redoing ensureChunkIndices each time we get such a
            # connection.
            client = getDbConnection(self.assetstore.get('mongohost'),
                                     self.assetstore.get('replicaset'),
                                     quiet=recent)
            self.chunkColl = MongoProxy(client[self.assetstore['db']].chunk)
            if not recent:
                _ensureChunkIndices(self.chunkColl)
                if self.assetstore.get('shard') == 'auto':
                    _setupSharding(self.chunkColl)
                if key is not None:
                    # The cache is simply emptied once it reaches its maximum
                    # size, rather than evicting individual entries.
                    if len(_recentConnections) >= RECENT_CONNECTION_CACHE_MAX_SIZE:
                        _recentConnections.clear()
                    _recentConnections[key] = {
                        'created': time.time()
                    }
        except pymongo.errors.ConnectionFailure:
            logger.error('Failed to connect to GridFS assetstore %s',
                         self.assetstore['db'])
            # A placeholder string is stored in place of the collection.
            self.chunkColl = 'Failed to connect'
            self.unavailable = True
        except pymongo.errors.ConfigurationError:
            logger.exception('Failed to configure GridFS assetstore %s',
                             self.assetstore['db'])
            self.chunkColl = 'Failed to configure'
            self.unavailable = True

    def initUpload(self, upload):
        """
        Assign the upload a fresh UUID, which is stored as the ``uuid`` of
        every chunk written for it, and the serialized state of an empty
        SHA-512 checksum.

        :param upload: The upload document; modified in place.
        :returns: The upload document.
        """
        upload['chunkUuid'] = uuid.uuid4().hex
        upload['sha512state'] = hash_state.serializeHex(sha512())
        return upload

    def uploadChunk(self, upload, chunk):
        """
        Stores the uploaded chunk in fixed-sized pieces in the chunks
        collection of this assetstore's database.

        :param upload: The upload document; its ``sha512state`` and
            ``received`` fields are updated in place.
        :param chunk: The data to store: text, bytes, or an object with
            ``read`` and ``close`` methods.
        :returns: The upload document.
        :raises ValidationException: if the size check fails; any pieces
            written by this call are deleted first.
        """
        # If we know the chunk size is too large or small, fail early.
        self.checkUploadSize(upload, self.getChunkSize(chunk))

        if isinstance(chunk, six.text_type):
            chunk = chunk.encode('utf8')

        if isinstance(chunk, six.binary_type):
            chunk = BytesIO(chunk)

        # Restore the internal state of the streaming SHA-512 checksum
        checksum = hash_state.restoreHex(upload['sha512state'], 'sha512')

        # TODO: when saving uploads is optional, we can conditionally try to
        # fetch the last chunk.  Add these line before `lastChunk = ...`:
        #   lastChunk = None
        #   if '_id' in upload or upload['received'] != 0:
        lastChunk = self.chunkColl.find_one({
            'uuid': upload['chunkUuid']
        }, projection=['n'], sort=[('n', pymongo.DESCENDING)])

        if lastChunk:
            # This bit of code will only do anything if there is a discrepancy
            # between the received count of the upload record and the length of
            # the file stored as chunks in the database. This code updates the
            # sha512 state with the difference before reading the bytes sent
            # from the user.
            if self.requestOffset(upload) > upload['received']:
                # This isn't right -- the last received amount may not be a
                # complete chunk.
                cursor = self.chunkColl.find({
                    'uuid': upload['chunkUuid'],
                    'n': {'$gte': upload['received'] // CHUNK_SIZE}
                }, projection=['data']).sort('n', pymongo.ASCENDING)
                for result in cursor:
                    checksum.update(result['data'])

        # Continue numbering after the last stored piece, or start at 0.
        n = lastChunk['n'] + 1 if lastChunk else 0
        size = 0
        # Remember where this call started so that a failed size check can
        # remove exactly the pieces written here.
        startingN = n

        while upload['received']+size < upload['size']:
            data = chunk.read(CHUNK_SIZE)
            if not data:
                break
            # If a timeout occurs while we are trying to load data, we might
            # have succeeded, in which case we will get a DuplicateKeyError
            # when it automatically retries.  Therefore, log this error but
            # don't stop.
            try:
                self.chunkColl.insert_one({
                    'n': n,
                    'uuid': upload['chunkUuid'],
                    'data': bson.binary.Binary(data)
                })
            except pymongo.errors.DuplicateKeyError:
                logger.info('Received a DuplicateKeyError while uploading, '
                            'probably because we reconnected to the database '
                            '(chunk uuid %s part %d)', upload['chunkUuid'], n)
            n += 1
            size += len(data)
            checksum.update(data)
        chunk.close()

        try:
            self.checkUploadSize(upload, size)
        except ValidationException:
            # The user tried to upload too much or too little.  Delete
            # everything we added
            self.chunkColl.delete_many({
                'uuid': upload['chunkUuid'],
                'n': {'$gte': startingN}
            })
            raise

        # Persist the internal state of the checksum
        upload['sha512state'] = hash_state.serializeHex(checksum)
        upload['received'] += size
        return upload

    def requestOffset(self, upload):
        """
        Return the larger of the upload's ``received`` count and CHUNK_SIZE
        multiplied by the highest chunk number ``n`` stored for this upload
        (0 when no chunk is stored). We return the max of the two because in
        testing mode we are uploading chunks that are smaller than the
        CHUNK_SIZE, which in practice will not work.

        NOTE(review): uploadChunk numbers pieces from 0, so ``n * CHUNK_SIZE``
        is the offset of the start of the last stored piece, not of its end --
        confirm that this is intended.
        """
        lastChunk = self.chunkColl.find_one({
            'uuid': upload['chunkUuid']
        }, projection=['n'], sort=[('n', pymongo.DESCENDING)])

        if lastChunk is None:
            offset = 0
        else:
            offset = lastChunk['n'] * CHUNK_SIZE

        return max(offset, upload['received'])

    def finalizeUpload(self, upload, file):
        """
        Grab the final state of the checksum and set it on the file object,
        and write the generated UUID into the file itself.

        :param upload: The upload document to read from.
        :param file: The file document; modified in place.
        :returns: The file document.
        """
        hash = hash_state.restoreHex(upload['sha512state'],
                                     'sha512').hexdigest()

        file['sha512'] = hash
        file['chunkUuid'] = upload['chunkUuid']
        # Stored on the file because downloadFile seeks using this value.
        file['chunkSize'] = CHUNK_SIZE

        return file

    def downloadFile(self, file, offset=0, headers=True, endByte=None,
                     contentDisposition=None, extraParameters=None, **kwargs):
        """
        Returns a generator function that will be used to stream the file from
        the database to the response.

        :param file: The file document to stream.
        :param offset: Byte offset at which to start.
        :param headers: Whether to set the response headers.
        :param endByte: Byte offset at which to stop (exclusive); values that
            are None or beyond the file size are clamped to the file size.
        :param contentDisposition: Passed through to setContentHeaders.
        :param extraParameters: Not used by this adapter.
        :returns: A callable that yields the requested bytes piece by piece,
            or a callable returning an empty string if the range is empty.
        """
        if endByte is None or endByte > file['size']:
            endByte = file['size']

        if headers:
            setResponseHeader('Accept-Ranges', 'bytes')
            self.setContentHeaders(file, offset, endByte, contentDisposition)

        # If the file is empty, we stop here
        if endByte - offset <= 0:
            return lambda: ''

        n = 0
        chunkOffset = 0

        # We must "seek" to the correct chunk index and local offset
        if offset > 0:
            n = offset // file['chunkSize']
            chunkOffset = offset % file['chunkSize']

        cursor = self.chunkColl.find({
            'uuid': file['chunkUuid'],
            'n': {'$gte': n}
        }, projection=['data']).sort('n', pymongo.ASCENDING)

        def stream():
            co = chunkOffset  # Can't assign to outer scope without "nonlocal"
            position = offset
            shouldBreak = False

            for chunk in cursor:
                chunkLen = len(chunk['data'])

                # Truncate the final piece so that nothing at or beyond
                # endByte is emitted.
                if position + chunkLen - co > endByte:
                    chunkLen = endByte - position + co
                    shouldBreak = True

                yield chunk['data'][co:chunkLen]

                if shouldBreak:
                    break

                position += chunkLen - co

                # Only the first piece is read from a non-zero local offset.
                if co > 0:
                    co = 0

        return stream

    def deleteFile(self, file):
        """
        Delete all of the chunks in the collection that correspond to the
        given file, provided exactly one file document in this assetstore
        matches its chunk UUID.
        """
        q = {
            'chunkUuid': file['chunkUuid'],
            'assetstoreId': self.assetstore['_id']
        }
        # limit=2 is enough to tell "exactly one" apart from "more than one".
        matching = File().find(q, limit=2, projection=[])
        if matching.count(True) == 1:
            # If we can't reach the database, we return anyway.  A system check
            # will be necessary to remove the abandoned file.  Since we already
            # can handle that case, tell Mongo to use a 0 write concern -- we
            # don't need to know that the chunks have been deleted, and this
            # can be faster.
            try:
                self.chunkColl.with_options(
                    write_concern=pymongo.WriteConcern(w=0)).delete_many(
                        {'uuid': file['chunkUuid']})
            except pymongo.errors.AutoReconnect:
                pass

    def cancelUpload(self, upload):
        """
        Delete all of the chunks associated with a given upload.
        """
        self.chunkColl.delete_many({'uuid': upload['chunkUuid']})
class Model(ModelImporter):
    """
    Model base class. Models are responsible for abstracting away the
    persistence layer. Each collection in the database should have its own
    model. Methods that deal with database interaction belong in the
    model layer.
    """

    def __init__(self):
        self.name = None
        self._indices = []
        self._textIndex = None
        self._textLanguage = None
        self.prefixSearchFields = ('lowerName', 'name')

        self._filterKeys = {
            AccessType.READ: set(),
            AccessType.WRITE: set(),
            AccessType.ADMIN: set(),
            AccessType.SITE_ADMIN: set()
        }

        # initialize() is expected to set self.name and declare indices, so it
        # must run before reconnect() binds the collection.
        self.initialize()
        self.reconnect()

    def reconnect(self):
        """
        Reconnect to the database and rebuild indices if necessary. Users
        should typically not have to call this method.
        """
        db_connection = getDbConnection()
        self.database = db_connection.get_default_database()
        self.collection = MongoProxy(self.database[self.name])

        for index in self._indices:
            # An index is either a bare key specification or a
            # (keys, kwargs) pair.
            if isinstance(index, (list, tuple)):
                self.collection.create_index(index[0], **index[1])
            else:
                self.collection.create_index(index)

        if isinstance(self._textIndex, dict):
            textIdx = [(k, 'text') for k in six.viewkeys(self._textIndex)]
            try:
                self.collection.create_index(
                    textIdx, weights=self._textIndex,
                    default_language=self._textLanguage)
            except pymongo.errors.OperationFailure:
                print(
                    TerminalColor.warning('WARNING: Text search not enabled.'))

    def exposeFields(self, level, fields):
        """
        Expose model fields to users with the given access level. Subclasses
        should call this in their initialize method to declare what fields
        should be exposed to what access levels if they are using the default
        filter implementation in this class. Since filtered fields are sets,
        this method is idempotent.

        :param level: The required access level for the field.
        :type level: AccessType
        :param fields: A field or list of fields to expose for that level.
        :type fields: str, list, or tuple
        """
        if isinstance(fields, six.string_types):
            fields = (fields, )

        self._filterKeys[level].update(fields)

    def hideFields(self, level, fields):
        """
        Hide a field, i.e. make sure it is not exposed via the default
        filtering method. Since the filter uses a white list, it is only ever
        necessary to call this for fields that were added previously with
        exposeFields().

        :param level: The access level to remove the fields from.
        :type level: AccessType
        :param fields: The field or fields to remove from the white list.
        :type fields: str, list, or tuple
        """
        if isinstance(fields, six.string_types):
            fields = (fields, )

        self._filterKeys[level].difference_update(fields)

    def filter(self, doc, user=None, additionalKeys=None):
        """
        Filter this model for the given user. This is a default implementation
        that assumes this model has no notion of access control, and simply
        allows all keys under READ access level, and conditionally allows any
        keys assigned to SITE_ADMIN level.

        :param doc: The document of this model type to be filtered.
        :type doc: dict or None
        :param user: The current user for whom we are filtering.
        :type user: dict or None
        :param additionalKeys: Any additional keys that should be included in
            the document for this call only.
        :type additionalKeys: list, tuple, set, or None
        :returns: The filtered document (dict).
        """
        if doc is None:
            return None

        # Copy, so that the per-call additions below never leak into the
        # model-wide whitelist.
        keys = set(self._filterKeys[AccessType.READ])

        if user and user.get('admin') is True:
            keys.update(self._filterKeys[AccessType.SITE_ADMIN])

        if additionalKeys:
            keys.update(additionalKeys)

        return self.filterDocument(doc, allow=keys)

    def ensureTextIndex(self, index, language='english'):
        """
        Call this during initialize() of the subclass if you want your
        model to have a full-text searchable index. Each collection may
        have zero or one full-text index.

        :param index: Mapping of field name to weight; passed as the
            ``weights`` of the text index.
        :type index: dict
        :param language: The default_language value for the text index,
            which is used for stemming and stop words. If the text index
            should not use stemming and stop words, set this param to 'none'.
        :type language: str
        """
        self._textIndex = index
        self._textLanguage = language

    def ensureIndices(self, indices):
        """
        Subclasses should call this with a list of strings representing
        fields that should be indexed in the database if there are any.
        Otherwise, it is not necessary to call this method. Elements of the
        list may also be a list or tuple, where the second element is a
        dictionary that will be passed as kwargs to the pymongo create_index
        call.
        """
        self._indices.extend(indices)

    def ensureIndex(self, index):
        """
        Like ensureIndices, but declares just a single index rather than a list
        of them.
        """
        self._indices.append(index)

    def validate(self, doc):
        """
        Models should implement this to validate the document before it enters
        the database. It must return the document with any necessary filters
        applied, or throw a ValidationException if validation of the document
        fails.

        :param doc: The document to validate before saving to the collection.
        :type doc: dict
        """
        raise Exception('Must override validate() in %s model.'
                        % self.__class__.__name__)  # pragma: no cover

    def initialize(self):
        """
        Subclasses should override this and set the name of the collection as
        self.name. Also, they should set any indexed fields that they require.
        """
        raise Exception('Must override initialize() in %s model'
                        % self.__class__.__name__)  # pragma: no cover

    def find(self, query=None, offset=0, limit=0, timeout=None,
             fields=None, sort=None, **kwargs):
        """
        Search the collection by a set of parameters. Passes any extra kwargs
        through to the underlying pymongo.collection.find function; kwargs
        that are not in the module's allowed list are silently dropped.

        :param query: The search query (see general MongoDB docs for "find()")
        :type query: dict
        :param offset: The offset into the results
        :type offset: int
        :param limit: Maximum number of documents to return
        :type limit: int
        :param sort: The sort order.
        :type sort: List of (key, order) tuples.
        :param fields: A mask for filtering result documents by key.
        :type fields: list[str]
        :param timeout: Cursor timeout in ms. Default is no timeout.
        :type timeout: int
        :returns: A pymongo database cursor.
        """
        query = query or {}
        kwargs = {k: kwargs[k] for k in kwargs if k in _allowedFindArgs}

        cursor = self.collection.find(
            filter=query, skip=offset, limit=limit, projection=fields,
            no_cursor_timeout=timeout is None, sort=sort, **kwargs)

        if timeout:
            cursor.max_time_ms(timeout)

        return cursor

    def findOne(self, query=None, fields=None, **kwargs):
        """
        Search the collection by a set of parameters. Passes any kwargs
        through to the underlying pymongo.collection.find_one function; kwargs
        that are not in the module's allowed list are silently dropped.

        :param query: The search query (see general MongoDB docs for "find()")
        :type query: dict
        :param sort: The sort order.
        :type sort: List of (key, order) tuples.
        :param fields: A mask for filtering result documents by key.
        :type fields: List of strings
        :returns: the first object that was found, or None if none found.
        """
        query = query or {}
        kwargs = {k: kwargs[k] for k in kwargs if k in _allowedFindArgs}
        return self.collection.find_one(query, projection=fields, **kwargs)

    def textSearch(self, query, offset=0, limit=0, sort=None, fields=None,
                   filters=None):
        """
        Perform a full-text search against the text index for this collection.
        The ``filters`` and ``fields`` arguments are not modified.

        :param query: The text query. Will be stemmed internally.
        :type query: str
        :param fields: A mask for filtering result documents by key. May be a
            projection dict or a list of field names.
        :type fields: dict, list, or None
        :param filters: Any additional query operators to apply.
        :type filters: dict
        :returns: A pymongo cursor. It is left to the caller to build the
            results from the cursor.
        """
        # Work on copies so the caller's arguments are never mutated.
        filters = dict(filters or {})
        if isinstance(fields, dict):
            fields = dict(fields)
        else:
            # The text score projection below needs a dict; a list of names is
            # the same projection as a dict mapping each name to 1.
            fields = dict.fromkeys(fields or (), 1)

        fields['_textScore'] = {'$meta': 'textScore'}
        filters['$text'] = {'$search': query}

        cursor = self.find(filters, offset=offset, limit=limit,
                           sort=sort, fields=fields)

        # Sort by meta text score, but only if result count is below a certain
        # threshold. The text score is not a real index, so we cannot always
        # sort by it if there is a high number of matching documents.
        if cursor.count() < TEXT_SCORE_SORT_MAX and sort is None:
            cursor.sort([('_textScore', {'$meta': 'textScore'})])

        return cursor

    def prefixSearch(self, query, offset=0, limit=0, sort=None, fields=None,
                     filters=None, prefixSearchFields=None):
        """
        Search for documents in this model's collection by a prefix string.
        The fields that will be searched based on this prefix must be set as
        the ``prefixSearchFields`` attribute of this model, which must be an
        iterable. Elements of this iterable must be either a string
        representing the field name, or a 2-tuple in which the first element is
        the field name, and the second element is a string representing the
        regex search options. The ``filters`` argument is not modified.

        :param query: The prefix string to look for.
        :type query: str
        :param filters: Any additional query operators to apply.
        :type filters: dict
        :param prefixSearchFields: To override the model's prefixSearchFields
            attribute for this invocation, pass an alternate iterable.
        :returns: A pymongo cursor. It is left to the caller to build the
            results from the cursor.
        """
        # Copy both the dict and its '$or' list so that neither of the
        # caller's objects is modified.
        filters = dict(filters or {})
        orClauses = list(filters.get('$or', []))
        # The query is escaped so that it is matched literally.
        regex = '^%s' % re.escape(query)

        for field in (prefixSearchFields or self.prefixSearchFields):
            if isinstance(field, (list, tuple)):
                orClauses.append({
                    field[0]: {
                        '$regex': regex,
                        '$options': field[1]
                    }
                })
            else:
                orClauses.append({
                    field: {'$regex': regex}
                })

        filters['$or'] = orClauses

        return self.find(
            filters, offset=offset, limit=limit, sort=sort, fields=fields)

    def save(self, document, validate=True, triggerEvents=True):
        """
        Create or update a document in the collection. This triggers two
        events; one prior to validation, and one prior to saving. Either of
        these events may have their default action prevented.

        :param document: The document to save.
        :type document: dict
        :param validate: Whether to call the model's validate() before saving.
        :type validate: bool
        :param triggerEvents: Whether to trigger events for validate and
            pre- and post-save hooks.
        :returns: The saved document. If a handler prevented the default
            action of the save event, the document is returned unsaved.
        :raises ValidationException: if the database rejects the write.
        """
        if validate and triggerEvents:
            event = events.trigger('.'.join(('model', self.name, 'validate')),
                                   document)
            if event.defaultPrevented:
                validate = False

        if validate:
            document = self.validate(document)

        if triggerEvents:
            event = events.trigger('model.%s.save' % self.name, document)
            if event.defaultPrevented:
                return document

        isNew = '_id' not in document
        try:
            if isNew:
                document['_id'] = \
                    self.collection.insert_one(document).inserted_id
            else:
                self.collection.replace_one(
                    {'_id': document['_id']}, document, upsert=True)
        except WriteError as e:
            raise ValidationException('Database save failed: %s' % e.details)

        if triggerEvents:
            if isNew:
                events.trigger('model.%s.save.created' % self.name, document)
            events.trigger('model.%s.save.after' % self.name, document)

        return document

    def update(self, query, update, multi=True):
        """
        This method should be used for updating multiple documents in the
        collection. This is useful for things like removing all references in
        this collection to a document that is being deleted from another
        collection.

        For updating a single document, use the save() model method instead.

        :param query: The query for finding documents to update. It's
                      the same format as would be passed to find().
        :type query: dict
        :param update: The update specifier.
        :type update: dict
        :param multi: Whether to update a single document, or all matching
            documents.
        :type multi: bool
        :returns: A pymongo UpdateResult object.
        """
        if multi:
            return self.collection.update_many(query, update)
        return self.collection.update_one(query, update)

    def increment(self, query, field, amount, **kwargs):
        """
        This is a specialization of the update method that atomically
        increments a field by a given amount. Additional kwargs are passed
        directly through to update.

        :param query: The query selector for documents to update.
        :type query: dict
        :param field: The name of the field in the document to increment.
        :type field: str
        :param amount: The amount to increment the field by.
        :type amount: int or float
        """
        self.update(query=query, update={
            '$inc': {field: amount}
        }, **kwargs)

    def remove(self, document, **kwargs):
        """
        Delete an object from the collection; must have its _id set. Two
        events are triggered first; if a handler prevents the default action
        of either, nothing is deleted and None is returned.

        :param document: the item to remove.
        :returns: the result of the delete, or None if it was prevented.
        """
        assert '_id' in document

        event = events.trigger('.'.join(('model', self.name, 'remove')),
                               document)
        kwargsEvent = events.trigger(
            '.'.join(('model', self.name, 'remove_with_kwargs')), {
                'document': document,
                'kwargs': kwargs
            })

        if not event.defaultPrevented and not kwargsEvent.defaultPrevented:
            return self.collection.delete_one({'_id': document['_id']})

    def removeWithQuery(self, query):
        """
        Remove all documents matching a given query from the collection.
        For safety reasons, you may not pass an empty query.
        """
        assert query

        return self.collection.delete_many(query)

    def load(self, id, objectId=True, fields=None, exc=False):
        """
        Fetch a single object from the database using its _id field.

        :param id: The value for searching the _id field.
        :type id: string or ObjectId
        :param objectId: Whether the id should be coerced to ObjectId type.
        :type objectId: bool
        :param fields: Fields list to include. Also can be a dict for
                       exclusion. See pymongo docs for how to use this arg.
        :param exc: Whether to raise a ValidationException if there is no
                    document with the given id.
        :type exc: bool
        :returns: The matching document, or None.
        :raises ValidationException: if the id is empty or cannot be coerced
            to an ObjectId, or if ``exc`` is True and nothing matches.
        """
        if not id:
            raise ValidationException('Attempt to load null ObjectId: %s' % id)

        if objectId and not isinstance(id, ObjectId):
            try:
                id = ObjectId(id)
            except InvalidId:
                raise ValidationException('Invalid ObjectId: %s' % id,
                                          field='id')
        doc = self.findOne({'_id': id}, fields=fields)

        if doc is None and exc is True:
            raise ValidationException('No such %s: %s' % (self.name, id),
                                      field='id')

        return doc

    def filterDocument(self, doc, allow=None):
        """
        This method will filter the given document to make it suitable to
        output to the user. The text score, when present, is always kept, and
        the model name is added as ``_modelType``.

        :param doc: The document to filter.
        :type doc: dict
        :param allow: The whitelist of fields to allow in the output document.
        :type allow: List of strings
        :returns: The filtered document, or None if ``doc`` is None.
        """
        if doc is None:
            return None

        out = {field: doc[field] for field in (allow or ()) if field in doc}

        if '_textScore' in doc:
            out['_textScore'] = doc['_textScore']

        out['_modelType'] = self.name

        return out

    def subtreeCount(self, doc):
        """
        Return the size of the subtree rooted at the given document. In
        general, if this contains items or folders, it will be the count of the
        items and folders in all containers. If it does not, it will be 1.
        This returns the absolute size of the subtree, it does not filter by
        permissions.

        :param doc: The root of the subtree.
        :type doc: dict
        """
        return 1
class ExternalMongoDatasetTestCase(base.TestCase):
    """
    Tests of the minerva external mongo dataset .
    """

    def setUp(self):
        """
        Set up the mongo db for the external dataset, with a collection
        named tweetsgeo, which have tweet data that is geolocated.
        """
        super(ExternalMongoDatasetTestCase, self).setUp()

        self._user = self.model('user').createUser(
            'minervauser', 'password', 'minerva', 'user',
            '*****@*****.**')

        # The external database lives on the same server as the configured
        # one: only the last path component of the URI (the database name) is
        # replaced.
        from girder.utility import config
        dbUri = config.getConfig()['database']['uri']
        self.dbName = 'minerva_test_external_mongo_dataset'
        dbUriParts = dbUri.split('/')[0:-1]
        self.dbUri = '/'.join(dbUriParts + [self.dbName])
        from girder.models import getDbConnection
        self.externalMongoDbConnection = getDbConnection(self.dbUri)
        self.externalMongoDb = self.externalMongoDbConnection.get_default_database()
        from girder.external.mongodb_proxy import MongoProxy
        self.collectionName = 'tweetsgeo'
        self.tweetsgeoCollection = MongoProxy(self.externalMongoDb[self.collectionName])

        # add test data to external dataset
        self.pluginTestDir = os.path.dirname(os.path.realpath(__file__))
        tweets100Path = os.path.join(self.pluginTestDir, 'data', 'tweets100.json')
        # Close both the archive and the member handle once the data is read.
        with zipfile.ZipFile('%s.zip' % tweets100Path) as z:
            with z.open('tweets100.json') as tweetsFile:
                tweets = json.load(tweetsFile)

        from datetime import datetime
        dateformat = '%Y-%m-%dT%H:%M:%S'
        epoch = datetime(1970, 1, 1)
        for tweet in tweets:
            # Store created_at as whole seconds since the epoch.
            d = datetime.strptime((tweet['created_at']), dateformat)
            tweet['created_at'] = int((d - epoch).total_seconds())
            self.tweetsgeoCollection.save(tweet)

        path = '/minerva_dataset/folder'
        params = {
            'userId': self._user['_id'],
        }
        # create a dataset folder
        self.request(path=path, method='POST', params=params, user=self._user)

    def tearDown(self):
        self.externalMongoDbConnection.drop_database(self.dbName)

    def testExternalDataset(self):
        """
        Create an external dataset from the mongo collection, then check its
        metadata, the geojson generated from it, the limits endpoint and
        geojson generation restricted to a date range.
        """
        # create an external dataset from the mongo collection
        path = '/minerva_dataset/external_mongo_dataset'
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params={
                'name': 'tweetsgeodataset',
                'dbConnectionUri': self.dbUri,
                'collectionName': self.collectionName
            }
        )
        self.assertStatusOk(response)
        self.assertHasKeys(response.json, ['mongo_connection', 'json_row', 'original_type'])
        self.assertEqual(response.json['original_type'], 'mongo',
                         'expected mongo for original_type')
        self.assertEqual(response.json['mongo_connection']['collection_name'],
                         self.collectionName, 'unexpected collection name')
        self.assertEqual(response.json['mongo_connection']['db_uri'], self.dbUri,
                         'unexpected db uri')

        minervaMetadata = response.json
        datasetId = minervaMetadata['dataset_id']

        # update the minerva metadata with coordinate mapping
        minervaMetadata["mapper"] = {
            "latitudeKeypath": "coordinates.coordinates[1]",
            "longitudeKeypath": "coordinates.coordinates[0]",
        }

        path = '/item/{}/metadata'.format(datasetId)
        response = self.request(
            path=path,
            method='GET',
            user=self._user,
        )
        metadata = response.json
        metadata['minerva'] = minervaMetadata
        self.request(
            path=path,
            method='PUT',
            user=self._user,
            body=json.dumps(metadata),
            type='application/json'
        )

        # create geojson in the dataset
        path = '/minerva_dataset/{}/geojson'.format(datasetId)
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
        )
        self.assertHasKeys(response.json, ['geojson'])

        # expect 100 points back as that is the size of the mongo dataset
        geojsonData = geojson.loads(response.json['geojson']['data'])

        # coordinate limits empirically figured: they are the min and max of
        # the x and y coordinates over all features of the test data
        xMin = -122.64
        xMax = -57.93991735
        yMin = -34.93523486
        yMax = 47.696623

        self.assertEqual(len(geojsonData['features']), 100,
                         'geojson should have 100 features')
        # to ensure correct mapping, check coords
        features = geojsonData['features']
        for feature in features:
            coordinates = feature['geometry']['coordinates']
            self.assertTrue(xMin <= coordinates[0], 'x coordinate out of range')
            self.assertTrue(xMax >= coordinates[0], 'x coordinate out of range')
            self.assertTrue(yMin <= coordinates[1], 'y coordinate out of range')
            self.assertTrue(yMax >= coordinates[1], 'y coordinate out of range')

        # test external_mongo_limits endpoint
        path = '/minerva_dataset/{}/external_mongo_limits'.format(datasetId)
        params = {'field': 'created_at'}
        response = self.request(
            path=path,
            method='GET',
            user=self._user,
            params=params
        )
        limits = response.json['mongo_fields']['created_at']
        self.assertEqual(limits['max'], 1380587461, 'incorrect max date')
        self.assertEqual(limits['min'], 1380587436, 'incorrect min date')

        # test limiting geojson to date range
        params = {
            'dateField': 'created_at',
            'startTime': 1380587440,
            'endTime': 1380587455,
        }
        path = '/minerva_dataset/{}/geojson'.format(datasetId)
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params=params
        )
        self.assertEqual(response.json['geojson']['query_count'], 52, 'invalid query count')
class MongoDatasetTestCase(base.TestCase):
    """
    Tests of the minerva mongo dataset .
    """

    def setUp(self):
        """
        Set up the mongo db for the external dataset, with 3 collections:
        a) tweetsgeo, which has tweet data that is geolocated (lat/long fields).
        b) polyGeoIndexed, w/2 polygons in a 2dsphere-indexed 'geometry' field
        c) polyGeoNonIndexed, same as above but without the 2dsphere index
        """
        super(MongoDatasetTestCase, self).setUp()

        self._user = self.model('user').createUser(
            'minervauser', 'password', 'minerva', 'user',
            '*****@*****.**')

        # The external database lives on the same server as the configured
        # one: only the last path component of the URI (the database name) is
        # replaced.
        from girder.utility import config
        dbUri = config.getConfig()['database']['uri']
        self.dbName = 'minerva_test_external_mongo_dataset'
        dbUriParts = dbUri.split('/')[0:-1]
        self.dbUri = '/'.join(dbUriParts + [self.dbName])
        from girder.models import getDbConnection
        self.externalMongoDbConnection = getDbConnection(self.dbUri)
        self.externalMongoDb = self.externalMongoDbConnection.get_default_database()
        from girder.external.mongodb_proxy import MongoProxy
        self.geojsonIndexedName = 'polyGeoIndexed'
        self.geojsonNonIndexedName = 'polyGeoNonIndexed'
        self.polyIndexedCollection = MongoProxy(self.externalMongoDb[self.geojsonIndexedName])
        self.polyNonIndexedCollection = MongoProxy(self.externalMongoDb[self.geojsonNonIndexedName])
        self.pluginTestDir = os.path.dirname(os.path.realpath(__file__))
        geojsonPath = os.path.join(self.pluginTestDir, 'data', 'polygons.json')
        with open(geojsonPath) as geojsonFile:
            polys = json.load(geojsonFile)
            for poly in polys:
                self.polyIndexedCollection.save(poly)
                self.polyNonIndexedCollection.save(poly)
            self.polyIndexedCollection.create_index([('geometry', '2dsphere')])
        self.collectionName = 'tweetsgeo'
        self.tweetsgeoCollection = MongoProxy(self.externalMongoDb[self.collectionName])

        # add test data to external dataset
        tweets100Path = os.path.join(self.pluginTestDir, 'data', 'tweets100.json')
        # Close both the archive and the member handle once the data is read.
        with zipfile.ZipFile('%s.zip' % tweets100Path) as z:
            with z.open('tweets100.json') as tweetsFile:
                tweets = json.load(tweetsFile)

        from datetime import datetime
        dateformat = '%Y-%m-%dT%H:%M:%S'
        epoch = datetime(1970, 1, 1)
        for tweet in tweets:
            # Store created_at as whole seconds since the epoch.
            d = datetime.strptime((tweet['created_at']), dateformat)
            tweet['created_at'] = int((d - epoch).total_seconds())
            self.tweetsgeoCollection.save(tweet)

    def tearDown(self):
        self.externalMongoDbConnection.drop_database(self.dbName)

    def testMongoDataSourceAndDataset(self):
        """
        Test Mongo source and dataset creation.

        Test automatic geojson configuration when there is a 2dsphere index
        or 'geometry' field in the collection.

        Test that geojson is not automatically configured for a collection
        that has no 'geometry' field or 2dsphere-indexed field.
        """
        # create a mongo source
        path = '/minerva_source_mongo'
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params={
                'name': 'mongogeodatasource',
                'dbConnectionUri': self.dbUri
            }
        )
        self.assertStatusOk(response)
        minerva_metadata = response.json['meta']['minerva']
        self.assertHasKeys(minerva_metadata, ['mongo_connection', 'source_type'])
        self.assertEqual(minerva_metadata['source_type'], 'mongo',
                         'expected mongo for source_type')
        self.assertEqual(minerva_metadata['mongo_connection']['db_uri'], self.dbUri,
                         'unexpected db uri')

        # Every dataset below is created from this one source. The id is taken
        # here only: later responses describe datasets, not the source.
        sourceId = response.json['_id']
        path = '/minerva_dataset_mongo'

        # create a mongo dataset from a spatially indexed collection
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params={
                'name': self.geojsonIndexedName,
                'mongoSourceId': sourceId,
                'mongo_collection': self.geojsonIndexedName
            }
        )
        self.assertStatusOk(response)
        minerva_metadata_indexed = response.json['meta']['minerva']
        self.assertHasKeys(minerva_metadata_indexed,
                           ['source_id', 'json_row', 'mongo_connection',
                            'original_type', 'geojson'])
        self.assertEqual(minerva_metadata_indexed['original_type'], 'mongo',
                         'expected mongo for original_type')
        self.assertEqual(minerva_metadata_indexed['mongo_connection']['db_uri'],
                         self.dbUri, 'unexpected db uri')
        self.assertEqual(minerva_metadata_indexed['mongo_connection']['collection_name'],
                         self.geojsonIndexedName, 'unexpected collection')
        self.assertHasKeys(minerva_metadata_indexed['geojson'], ['query_count', 'data'])
        self.assertEqual(minerva_metadata_indexed['geojson']['query_count'], 2)
        geojsonData = json.loads(minerva_metadata_indexed['geojson']['data'])
        self.assertHasKeys(geojsonData, ['features', 'type'])
        self.assertEqual(geojsonData['type'], 'FeatureCollection')

        # create a mongo dataset from a spatial but non-indexed collection
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params={
                'name': self.geojsonNonIndexedName,
                'mongoSourceId': sourceId,
                'mongo_collection': self.geojsonNonIndexedName
            }
        )
        self.assertStatusOk(response)
        minerva_metadata_nonindexed = response.json['meta']['minerva']
        self.assertHasKeys(minerva_metadata_nonindexed,
                           ['source_id', 'json_row', 'mongo_connection',
                            'original_type', 'geojson'])
        self.assertEqual(minerva_metadata_nonindexed['original_type'], 'mongo',
                         'expected mongo for original_type')
        self.assertEqual(minerva_metadata_nonindexed['mongo_connection']['db_uri'],
                         self.dbUri, 'unexpected db uri')
        self.assertEqual(minerva_metadata_nonindexed['mongo_connection']['collection_name'],
                         self.geojsonNonIndexedName, 'unexpected collection')
        self.assertHasKeys(minerva_metadata_nonindexed['geojson'], ['query_count', 'data'])
        self.assertEqual(minerva_metadata_nonindexed['geojson']['query_count'], 2)
        geojsonData = json.loads(minerva_metadata_nonindexed['geojson']['data'])
        self.assertHasKeys(geojsonData, ['features', 'type'])
        self.assertEqual(geojsonData['type'], 'FeatureCollection')

        # create a mongo dataset from a collection without a geometry field
        response = self.request(
            path=path,
            method='POST',
            user=self._user,
            params={
                'name': self.collectionName,
                'mongoSourceId': sourceId,
                'mongo_collection': self.collectionName
            }
        )
        self.assertStatusOk(response)
        minerva_metadata_nogeometry = response.json['meta']['minerva']
        self.assertHasKeys(minerva_metadata_nogeometry,
                           ['source_id', 'json_row', 'mongo_connection',
                            'original_type'])
        self.assertEqual(minerva_metadata_nogeometry['original_type'], 'mongo',
                         'expected mongo for original_type')
        self.assertEqual(minerva_metadata_nogeometry['mongo_connection']['db_uri'],
                         self.dbUri, 'unexpected db uri')
        self.assertEqual(minerva_metadata_nogeometry['mongo_connection']['collection_name'],
                         self.collectionName, 'unexpected collection')
        self.assertNotHasKeys(minerva_metadata_nogeometry, ['geojson'])
def getDbConnection(uri=None, replicaSet=None, autoRetry=True, quiet=False,
                    **kwargs):
    """
    Get a MongoClient object that is connected to the configured database.
    We lazy-instantiate a module-level singleton, the MongoClient objects
    manage their own connection pools internally. Any extra kwargs you pass to
    this method will be passed through to the MongoClient.

    Client options are layered, lowest to highest precedence: the defaults
    defined in this function, every other key of the database config,
    ``kwargs``, and finally any option named in the query string of ``uri``.

    :param uri: if specified, connect to this mongo db rather than the one in
        the config.
    :param replicaSet: if uri is specified, use this replica set.
    :param autoRetry: if this connection should automatically retry operations
        in the event of an AutoReconnect exception. If you're testing the
        connection, set this to False. If disabled, this also will not cache
        the mongo client, so make sure to only disable if you're testing a
        connection.
    :type autoRetry: bool
    :param quiet: if true, don't logprint warnings and success.
    :type quiet: bool
    :returns: the pymongo client, wrapped in a MongoProxy when ``autoRetry``
        is true.
    """
    # The cache is keyed on the arguments as the caller passed them, so a
    # repeated call hits the cache before the config is consulted.
    origKey = (uri, replicaSet)
    if origKey in _dbClients:
        return _dbClients[origKey]

    dbConf = getDbConfig()

    if not uri:
        uri = dbConf.get('uri')
        replicaSet = dbConf.get('replica_set')

    clientOptions = {
        # This is the maximum time between when we fetch data from a cursor.
        # If it times out, the cursor is lost and we can't reconnect.  If it
        # isn't set, we have issues with replica sets when the primary goes
        # down.  This value can be overridden in the mongodb uri connection
        # string with the socketTimeoutMS.
        'socketTimeoutMS': 60000,
        'connectTimeoutMS': 20000,
        'serverSelectionTimeoutMS': 20000,
        'readPreference': 'secondaryPreferred',
        'replicaSet': replicaSet,
        'w': 'majority'
    }

    # All other options in the [database] section will be passed directly as
    # options to the mongo client
    for opt, val in six.viewitems(dict(dbConf)):
        if opt not in {'uri', 'replica_set'}:
            clientOptions[opt] = val

    # Finally, kwargs take precedence
    clientOptions.update(kwargs)

    # If the connection URI overrides any option, honor it above our own
    # settings. ``uri`` can still be None here when the config has no URI,
    # so parse an empty string in that case instead of passing None along.
    uriParams = urllib.parse.parse_qs(urllib.parse.urlparse(uri or '').query)
    for key in uriParams:
        clientOptions.pop(key, None)

    if uri is None:
        # NOTE(review): this branch was unreadable in the reviewed copy (the
        # string literal and the split on '@' were corrupted).  The
        # reconstruction assumes a missing URI should fall through to
        # pymongo's own default host -- confirm against the intended default.
        dbUriRedacted = 'mongodb://localhost:27017'
        if not quiet:
            logprint.info(
                'WARNING: No MongoDB URI specified, using the pymongo '
                'default host')
    else:
        # Never log credentials: keep only what follows the '@' separator.
        parts = uri.split('@')
        if len(parts) == 2:
            dbUriRedacted = 'mongodb://' + parts[1]
        else:
            dbUriRedacted = uri

    client = pymongo.MongoClient(uri, **clientOptions)

    if not quiet:
        desc = ''
        if replicaSet:
            desc += ', replica set: %s' % replicaSet
        logprint.info('Connecting to MongoDB: %s%s' % (dbUriRedacted, desc))

    # Make sure we can connect to the mongo server at startup
    client.server_info()

    if autoRetry:
        # Only the retrying proxy is cached; a bare client requested with
        # autoRetry=False is handed back without being stored.
        client = MongoProxy(client, logger=logger)
        _dbClients[origKey] = _dbClients[(uri, replicaSet)] = client

    return client
def getDbConnection(uri=None, replicaSet=None, autoRetry=True, **kwargs):
    """
    Get a MongoClient object that is connected to the configured database.
    We lazy-instantiate a module-level singleton, the MongoClient objects
    manage their own connection pools internally. Any extra kwargs you pass to
    this method will be passed through to the MongoClient.

    :param uri: if specified, connect to this mongo db rather than the one in
        the config.
    :param replicaSet: if uri is specified, use this replica set.
    :param autoRetry: if this connection should automatically retry operations
        in the event of an AutoReconnect exception. If you're testing the
        connection, set this to False. If disabled, this also will not cache
        the mongo client, so make sure to only disable if you're testing a
        connection.
    :type autoRetry: bool
    :returns: the pymongo client, wrapped in a MongoProxy when ``autoRetry``
        is true.
    """
    # The cache is keyed on the arguments as the caller passed them, so a
    # repeated call hits the cache before the config is consulted.
    origKey = (uri, replicaSet)
    if origKey in _dbClients:
        return _dbClients[origKey]

    if not uri:
        dbConf = getDbConfig()
        uri = dbConf.get('uri')
        replicaSet = dbConf.get('replica_set')

    clientOptions = {
        # This is the maximum time between when we fetch data from a cursor.
        # If it times out, the cursor is lost and we can't reconnect.  If it
        # isn't set, we have issues with replica sets when the primary goes
        # down.  This value can be overridden in the mongodb uri connection
        # string with the socketTimeoutMS.
        'socketTimeoutMS': 60000,
        'connectTimeoutMS': 20000,
        'serverSelectionTimeoutMS': 20000,
        'read_preference': ReadPreference.SECONDARY_PREFERRED,
        'replicaSet': replicaSet
    }
    # Caller-supplied options win over the defaults above.
    clientOptions.update(kwargs)

    if uri is None:
        # NOTE(review): this branch was unreadable in the reviewed copy (the
        # string literal and the split on '@' were corrupted).  The
        # reconstruction assumes a missing URI should fall through to
        # pymongo's own default host -- confirm against the intended default.
        dbUriRedacted = 'mongodb://localhost:27017'
        print(TerminalColor.warning(
            'WARNING: No MongoDB URI specified, using the pymongo '
            'default host'))
    else:
        # Never print credentials: keep only what follows the '@' separator.
        parts = uri.split('@')
        if len(parts) == 2:
            dbUriRedacted = 'mongodb://' + parts[1]
        else:
            dbUriRedacted = uri

    client = pymongo.MongoClient(uri, **clientOptions)

    # Make sure we can connect to the mongo server at startup
    client.server_info()

    if autoRetry:
        # Only the retrying proxy is cached; a bare client requested with
        # autoRetry=False is handed back without being stored.
        client = MongoProxy(client, logger=logger)
        _dbClients[origKey] = _dbClients[(uri, replicaSet)] = client

    desc = ''
    if replicaSet:
        desc += ', replica set: %s' % replicaSet
    print(TerminalColor.info(
        'Connected to MongoDB: %s%s' % (dbUriRedacted, desc)))
    return client