def setUp(self):
    """Start the mongod instances and wait for the replica set to be ready.

    This is a generator that yields Deferreds; the decorator that drives it
    is not visible in this chunk.

    Steps:
      1. Start one ``Mongod`` per port in ``self.ports``.
      2. Send ``replSetInitiate`` with ``self.rsconfig`` through a helper
         connection to the node on ``self.ports[0]``.
      3. Poll ``ismaster`` and ``replSetGetStatus`` every
         ``__ping_interval`` seconds, at most
         ``__init_timeout / __ping_interval`` times.

    Raises:
        Exception: if the replica set is not ready after the last poll
            (``self.tearDown()`` is run before raising).
    """
    self.__mongod = [Mongod(port=p, replset=self.rsname) for p in self.ports]
    yield defer.gatherResults([mongo.start() for mongo in self.__mongod])

    master_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(self.ports[0])
    master = ConnectionPool(master_uri)

    # Member states that count as "ready"; invariant, so built once.
    ok_states = ("PRIMARY", "SECONDARY")
    ready = False
    try:
        yield master.admin.command("replSetInitiate", self.rsconfig)

        n_tries = int(self.__init_timeout / self.__ping_interval)
        for _ in range(n_tries):
            yield self.__sleep(self.__ping_interval)

            # Experience shows that both ismaster and replSetGetStatus have
            # to be queried to be sure that the replica set is up and
            # running, a primary is elected and all secondaries are in sync
            # and ready to become the new primary.
            ismaster_req = master.admin.command("ismaster", check=False)
            replstatus_req = master.admin.command("replSetGetStatus", check=False)
            ismaster, replstatus = yield defer.gatherResults([ismaster_req, replstatus_req])

            initialized = replstatus["ok"]
            states_ready = all(m["stateStr"] in ok_states
                               for m in replstatus.get("members", []))
            ready = initialized and ismaster["ismaster"] and states_ready

            if ready:
                break
    finally:
        # Release the helper connection on every path (success, timeout or
        # a failing command) so that it is never left open.
        yield master.disconnect()

    if not ready:
        yield self.tearDown()
        raise Exception("ReplicaSet initialization took more than {0}s".format(self.__init_timeout))
def test_SlaveOk(self):
    """Query the node on ``self.ports[1]`` with the slave-ok flag.

    The find is expected to return an empty list, and a subsequent insert
    through the same connection is expected to fail with OperationFailure.
    """
    node_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(
        self.ports[1])
    pool = ConnectionPool(node_uri)
    try:
        docs = yield pool.db.coll.find(flags=QUERY_SLAVE_OK)
        self.assertEqual(docs, [])

        # The write must be rejected by this node.
        yield self.assertFailure(pool.db.coll.insert({'x': 42}),
                                 OperationFailure)
    finally:
        yield pool.disconnect()
def test_AutoReconnect_from_primary_step_down(self):
    """A forced ``replSetStepDown`` must surface as AutoReconnect.

    Connects to the node on ``self.ports[0]`` with ``max_delay=5``, sends a
    forced step-down command and checks that the command's Deferred
    errbacks with AutoReconnect. The connection is closed on every path.
    """
    uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
    conn = ConnectionPool(uri, max_delay=5)

    try:
        # this will force primary to step down, triggering an AutoReconnect
        # that bubbles up through the connection pool to the client
        command = conn.admin.command(SON([('replSetStepDown', 86400), ('force', 1)]))

        # assertFailure only attaches callbacks and returns the Deferred;
        # it has to be waited on, otherwise the assertion is never enforced
        # by this test.
        yield self.assertFailure(command, AutoReconnect)
    finally:
        yield conn.disconnect()
def test_AutoReconnect_from_primary_step_down(self):
    """A forced ``replSetStepDown`` must surface as AutoReconnect.

    Patches ``_Connection.maxDelay`` to 5 for the duration of the test,
    connects to the node on ``self.ports[0]``, sends a forced step-down
    command and checks that the command's Deferred errbacks with
    AutoReconnect. The connection is closed on every path.
    """
    self.patch(_Connection, 'maxDelay', 5)

    uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
    conn = ConnectionPool(uri)

    try:
        # this will force primary to step down, triggering an AutoReconnect
        # that bubbles up through the connection pool to the client
        command = conn.admin.command(SON([('replSetStepDown', 86400), ('force', 1)]))

        # assertFailure only attaches callbacks and returns the Deferred;
        # it has to be waited on, otherwise the assertion is never enforced
        # by this test.
        yield self.assertFailure(command, AutoReconnect)
    finally:
        yield conn.disconnect()
def test_ConnectionUrlParams(self):
    """Options from the URI query string (``w=2&j=true``) are forwarded.

    Inserts one document while ``self.mock_gle()`` is active and checks
    that the mock was called exactly once with ``'mydb', w=2, j=True``.
    The collection is dropped and the connection closed afterwards.
    """
    uri = "mongodb://{0}:{1}/?w=2&j=true".format(mongo_host, mongo_port)
    pool = ConnectionPool(uri)
    collection = pool.mydb.mycol

    try:
        with self.mock_gle() as gle:
            yield collection.insert({'x': 42})
            gle.assert_called_once_with('mydb', w=2, j=True)
    finally:
        yield collection.drop()
        yield pool.disconnect()
def test_ConnectionUrlParams(self):
    """Verify that ``w`` and ``j`` given in the connection URL are applied.

    One insert is issued under ``self.mock_gle()``; the mock must have been
    called once with ``'mydb', w=2, j=True``. Cleanup drops the collection
    and disconnects.
    """
    url_template = "mongodb://{0}:{1}/?w=2&j=true"
    connection = ConnectionPool(url_template.format(mongo_host, mongo_port))
    target = connection.mydb.mycol

    try:
        with self.mock_gle() as gle_mock:
            yield target.insert({'x': 42})
            gle_mock.assert_called_once_with('mydb', w=2, j=True)
    finally:
        yield target.drop()
        yield connection.disconnect()
def test_SlaveOk(self):
    """Query the node on ``self.ports[1]`` with the slave-ok flag.

    The find must return an empty list. An insert through the same
    connection must then fail; the expected exception type is chosen from
    the server version reported by ``serverStatus``: AutoReconnect when the
    parsed version list compares greater than ``[4, 2]``, OperationFailure
    otherwise.
    """
    node_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(
        self.ports[1])
    pool = ConnectionPool(node_uri)
    try:
        docs = yield pool.db.coll.find(flags=QUERY_SLAVE_OK)
        self.assertEqual(docs, [])

        status = yield pool.admin.command("serverStatus")
        version_parts = list(map(int, status["version"].split('.')))
        if version_parts > [4, 2]:
            expected_error = AutoReconnect
        else:
            expected_error = OperationFailure

        yield self.assertFailure(pool.db.coll.insert({'x': 42}), expected_error)
    finally:
        yield pool.disconnect()
def setUp(self):
    """Start the mongod instances and wait for the replica set to be ready.

    This is a generator that yields Deferreds; the decorator that drives it
    is not visible in this chunk.

    Steps:
      1. Start one ``Mongod`` per port in ``self.ports``.
      2. Run ``__check_reachable`` against every port.
      3. Send ``replSetInitiate`` with ``self.rsconfig`` through a helper
         connection to the node on ``self.ports[0]``.
      4. Poll ``ismaster`` and ``replSetGetStatus`` every
         ``__ping_interval`` seconds, at most
         ``__init_timeout / __ping_interval`` times.

    Raises:
        Exception: if the replica set is not ready after the last poll
            (``self.tearDown()`` is run before raising).
    """
    self.__mongod = [
        Mongod(port=p, replset=self.rsname) for p in self.ports
    ]
    yield defer.gatherResults([mongo.start() for mongo in self.__mongod])

    yield defer.gatherResults(
        [self.__check_reachable(port) for port in self.ports])

    master_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(
        self.ports[0])
    master = ConnectionPool(master_uri)

    # Member states that count as "ready"; invariant, so built once.
    ok_states = {"PRIMARY", "SECONDARY"}
    ready = False
    try:
        yield master.admin.command("replSetInitiate", self.rsconfig)

        n_tries = int(self.__init_timeout / self.__ping_interval)
        for _ in range(n_tries):
            yield self.__sleep(self.__ping_interval)

            # Experience shows that both ismaster and replSetGetStatus have
            # to be queried to be sure that the replica set is up and
            # running, a primary is elected and all secondaries are in sync
            # and ready to become the new primary.
            ismaster_req = master.admin.command("ismaster", check=False)
            replstatus_req = master.admin.command("replSetGetStatus",
                                                  check=False)
            ismaster, replstatus = yield defer.gatherResults(
                [ismaster_req, replstatus_req])

            initialized = replstatus["ok"]
            states_ready = all(m["stateStr"] in ok_states
                               for m in replstatus.get("members", []))
            ready = initialized and ismaster["ismaster"] and states_ready

            if ready:
                break
    finally:
        # Release the helper connection on every path (success, timeout or
        # a failing command) so that it is never left open.
        yield master.disconnect()

    if not ready:
        yield self.tearDown()
        raise Exception(
            "ReplicaSet initialization took more than {0}s".format(
                self.__init_timeout))
def test_AutoReconnect(self):
    """Reads keep being retried while the first mongod is suspended.

    Inserts ``{'x': 42}``, sends SIGSTOP to the first mongod, then repeats
    ``find_one`` (ignoring AutoReconnect) until a document comes back and
    checks its ``x`` value. Cleanup sends SIGCONT, disconnects and flushes
    the logged AutoReconnect errors.
    """
    try:
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports)),
            max_delay=5)

        yield pool.db.coll.insert({'x': 42}, safe=True)

        self.__mongod[0].kill(signal.SIGSTOP)

        while True:
            try:
                doc = yield pool.db.coll.find_one()
            except AutoReconnect:
                # Expected while the node is unavailable: try again.
                continue
            self.assertEqual(doc['x'], 42)
            break
    finally:
        self.__mongod[0].kill(signal.SIGCONT)
        yield pool.disconnect()
        self.flushLoggedErrors(AutoReconnect)
def test_AutoReconnect(self):
    """Reads keep being retried after the first mongod is stopped.

    With ``_Connection.maxDelay`` patched to 5, inserts ``{'x': 42}``,
    stops the first mongod, then repeats ``find_one`` (ignoring
    AutoReconnect) until a document comes back and checks its ``x`` value.
    Cleanup disconnects and flushes the logged AutoReconnect errors.
    """
    self.patch(_Connection, 'maxDelay', 5)

    try:
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports)))

        yield pool.db.coll.insert({'x': 42}, safe=True)

        yield self.__mongod[0].stop()

        while True:
            try:
                doc = yield pool.db.coll.find_one()
            except AutoReconnect:
                # Expected while the node is unavailable: try again.
                continue
            self.assertEqual(doc['x'], 42)
            break
    finally:
        yield pool.disconnect()
        self.flushLoggedErrors(AutoReconnect)
def test_TimeExceeded_insert(self):
    """An insert with ``timeout=2`` must end in TimeExceeded.

    Inserts ``{'x': 42}``, sends SIGSTOP to the first mongod, then repeats
    a timed insert of ``{'y': 42}``: AutoReconnect means "try again",
    TimeExceeded is the expected outcome, and a successful insert fails the
    test. Cleanup sends SIGCONT, disconnects and flushes the logged
    AutoReconnect errors.
    """
    try:
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports)),
            retry_delay=3, max_delay=5)

        yield pool.db.coll.insert({'x': 42}, safe=True)

        self.__mongod[0].kill(signal.SIGSTOP)

        while True:
            try:
                yield pool.db.coll.insert({'y': 42}, safe=True, timeout=2)
            except TimeExceeded:
                # this is what we should have returned
                break
            except AutoReconnect:
                continue
            # The insert went through although it should have timed out.
            self.fail("TimeExceeded not raised!")
    finally:
        self.__mongod[0].kill(signal.SIGCONT)
        yield pool.disconnect()
        self.flushLoggedErrors(AutoReconnect)
def test_TimeExceeded_insert(self):
    """An insert with ``timeout=2`` must end in TimeExceeded.

    With ``_Connection.maxDelay`` patched to 5 and ``initial_delay=3``,
    inserts ``{'x': 42}``, stops the first mongod, then repeats a timed
    insert of ``{'y': 42}``: AutoReconnect means "try again", TimeExceeded
    is the expected outcome, and a successful insert fails the test.
    Cleanup disconnects and flushes the logged AutoReconnect errors.
    """
    self.patch(_Connection, 'maxDelay', 5)

    try:
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports)),
            initial_delay=3)

        yield pool.db.coll.insert({'x': 42}, safe=True)

        yield self.__mongod[0].stop()

        while True:
            try:
                yield pool.db.coll.insert({'y': 42}, safe=True, timeout=2)
            except TimeExceeded:
                # this is what we should have returned
                break
            except AutoReconnect:
                continue
            # The insert went through although it should have timed out.
            self.fail("TimeExceeded not raised!")
    finally:
        yield pool.disconnect()
        self.flushLoggedErrors(AutoReconnect)
def __check_reachable(self, port):
    """Open a connection to ``localhost:port`` and run one ``ismaster``.

    The command is sent with ``check=False`` and its reply is ignored; the
    call only serves to wait until a round trip to that node completes.

    Args:
        port: TCP port of the mongod instance to probe.
    """
    uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(port)
    conn = ConnectionPool(uri)
    try:
        yield conn.admin.command("ismaster", check=False)
    finally:
        # Close the probe connection even if the command errbacks, so a
        # failed probe does not leave a connection behind.
        yield conn.disconnect()
class MongodbPipeline(object):
    """Pipeline that writes to Mongo Database"""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Reads ``MONGO_DB_URI``, ``STAGING_DB`` and ``TABLE_PORTS``.

        Raises:
            NotConfigured: if any of the three settings is missing or
                empty, which disables the pipeline.
        """
        settings = crawler.settings
        mongo_url = settings.get('MONGO_DB_URI', None)
        mongo_db = settings.get('STAGING_DB', None)
        mongo_collection = settings.get('TABLE_PORTS', None)
        config = [mongo_url, mongo_db, mongo_collection]

        # All three values are required by __init__; a partial
        # configuration must disable the pipeline instead of crashing there.
        if not all(config):
            raise NotConfigured('Mongodb parameters not configured')

        return cls(config)

    def __init__(self, config):
        """Open a MongoDB connection pool.

        Args:
            config: sequence ``(mongo_url, mongo_db, mongo_collection)``.
        """
        # Report connection error only once
        self.report_connection_error = True

        mongo_url, mongo_db, mongo_collection = config

        # Accept "mongo:" as an alias of the "mongodb:" scheme. Only the
        # leading scheme is rewritten, so a host that happens to be named
        # "mongo" (e.g. "mongodb://mongo:27017") is left untouched.
        if mongo_url.startswith('mongo:'):
            mongo_url = 'mongodb:' + mongo_url[len('mongo:'):]

        # Setup MongoDB Connection
        self.mongo_url = mongo_url
        self.connection = ConnectionPool(mongo_url, connect_timeout=5)
        self.mongo_db = self.connection[mongo_db]
        self.collection = self.mongo_db[mongo_collection]

    def close_spider(self, spider):
        """Disconnect from the database when the spider closes.

        Returns whatever ``disconnect()`` returns so that the caller can
        wait for the disconnect to finish.
        """
        return self.connection.disconnect()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        """Upsert the item into the collection and pass it on.

        The item is used both as the filter and as the replacement document
        of ``replace_one(..., upsert=True)``.

        Returns:
            The unchanged ``item`` (through ``defer.returnValue``).

        Raises:
            TransactionError: if the upsert raises any exception. The error
                and its traceback are logged only the first time.
        """
        logger = spider.logger
        try:
            yield self.collection.replace_one(
                filter=item,
                replacement=item,
                upsert=True
            )
        except Exception:
            if self.report_connection_error:
                logger.error("An error occured while Upserting data")
                self.report_connection_error = False
                logger.error(traceback.format_exc())
            raise TransactionError('An error occured during transaction.')

        # Return the item for the next stage
        defer.returnValue(item)
class PipelineMongoDBAsync(object):
    """Scrapy pipeline backed by an asynchronous MongoDB connection pool.

    The connection, database and collection handles are created in
    ``open_spider`` and released in ``close_spider``.
    """

    def __init__(self, crawler: Crawler, *args, **kwargs):
        """Store the crawler, its settings and the MongoDB URI."""
        self.crawler = crawler
        self.settings = crawler.settings
        self.uri = get_mongodb_uri(self.settings)
        self.codec_options = DEFAULT_CODEC_OPTIONS.with_options(
            unicode_decode_error_handler='ignore')
        # Filled in by open_spider().
        self.cnx = None
        self.db = None
        self.coll = None

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args, **kwargs):
        """Create the pipeline and wire it to the crawler's signals.

        ``process_item`` is set on the class: the object named by the
        ``MONGODB_PROCESS_ITEM`` setting when present, otherwise a
        pass-through that returns the item unchanged.
        """
        cls.process_item = (load_object(crawler.settings[MONGODB_PROCESS_ITEM])
                            if crawler.settings.get(MONGODB_PROCESS_ITEM)
                            else lambda pipeline, item, spider: item)
        # crawler is passed positionally so that extra positional arguments
        # do not collide with it.
        o = cls(crawler, *args, **kwargs)
        crawler.signals.connect(o.process_item_insert_one,
                                signal=insert_for_objectid)
        crawler.signals.connect(o.process_item_update_one,
                                signal=update_for_objectid)
        return o

    @inlineCallbacks
    def open_spider(self, spider: Spider):
        """Open the connection, select the collection and create indexes."""
        self.cnx = ConnectionPool(self.uri, codec_options=self.codec_options)
        self.db = getattr(self.cnx, self.settings[MONGODB_DATABASE])
        self.coll = getattr(self.db, self.settings[MONGODB_COLLECTION])
        # with_options() returns a new collection object rather than
        # modifying the receiver, so the result has to be kept.
        self.coll = self.coll.with_options(codec_options=self.codec_options)
        yield self.create_index(spider)
        logger.info('Spider opened: Open the connection to MongoDB: %s',
                    self.uri)

    # TODO: ADD UNIT TEST FOR THIS FUNCTION
    # the api of create_index in txmongo is different with the one in mongomock
    @inlineCallbacks
    def create_index(self, spider: Spider):
        """Create the indexes listed in the ``MONGODB_INDEXES`` setting.

        Each entry is ``(field, order, [options])`` where ``order`` is a
        callable applied to ``field`` and ``options`` is an optional dict
        of keyword arguments for ``create_index``.

        Returns:
            A list with the result of every ``create_index`` call that
            succeeded. Entries that raise OperationFailure are logged and
            skipped.
        """
        results = []
        for field, _order, *args in self.settings.get(MONGODB_INDEXES, list()):
            # The options dict is optional; default to no extra arguments.
            options = args[0] if args else {}
            try:
                created = yield self.coll.create_index(
                    txfilter.sort(_order(field)), **options)
            except OperationFailure:
                # Best effort: keep going, but leave a trace in the log.
                logger.warning('Could not create index on %r', field,
                               exc_info=True)
            else:
                results.append(created)
        return results

    @inlineCallbacks
    def close_spider(self, spider: Spider):
        """Disconnect from MongoDB."""
        yield self.cnx.disconnect()
        logger.info('Spider closed: Close the connection to MongoDB %s',
                    self.uri)

    @inlineCallbacks
    def process_item_insert_one(self, doc: Dict, spider: Spider) -> Generator:
        """Insert ``doc`` and return the result of ``insert_one``."""
        result = yield self.coll.insert_one(doc)
        return result

    @inlineCallbacks
    def process_item_update_one(self, filter_: Dict, update: Dict, upsert: bool,
                                spider: Spider) -> Generator:
        """Run ``update_one`` with the given arguments and return its result."""
        result = yield self.coll.update_one(filter=filter_, update=update,
                                            upsert=upsert)
        return result
class TestCancelIntegrated(unittest.TestCase):
    """Cancellation of operations issued through a ConnectionPool."""

    def setUp(self):
        """Create a pool with default arguments and pick ``db.coll``."""
        self.conn = ConnectionPool()
        self.db = self.conn.db
        self.coll = self.db.coll

    @defer.inlineCallbacks
    def tearDown(self):
        """Drop the test collection, then disconnect."""
        yield self.coll.drop()
        yield self.conn.disconnect()

    @defer.inlineCallbacks
    def test_integration(self):
        """Inserts cancelled before the pool connects never reach MongoDB."""
        # Our ConnectionPool is not actually connected yet, so on this
        # stage operations can be safely cancelled -- they won't be
        # sent to MongoDB at all. This test checks this.
        first, second, third, fourth = [
            self.coll.insert_one({'x': value}) for value in (1, 2, 3, 4)
        ]
        for doomed in (first, third):
            doomed.cancel()

        yield fourth

        self.failureResultOf(first, defer.CancelledError)
        self.assertTrue(second.called)
        self.failureResultOf(third, defer.CancelledError)

        stored = yield self.coll.distinct('x')
        self.assertEqual(set(stored), {2, 4})

    @defer.inlineCallbacks
    def test_remove(self):
        """A remove cancelled before the pool connects deletes nothing."""
        # Lets test cancellation of some dangerous operation for the peace
        # of mind. NB: remove can be cancelled only because ConnectionPool
        # is not connected yet.
        for value in range(10):
            self.coll.insert_one({'x': value})

        low = self.coll.remove({'x': {"$lt": 3}})
        middle = self.coll.remove({'x': {"$gte": 3, "$lt": 6}})
        high = self.coll.remove({'x': {"$gte": 6, "$lt": 9}})

        middle.cancel()

        yield high

        self.assertTrue(low.called)
        self.failureResultOf(middle, defer.CancelledError)

        survivors = yield self.coll.distinct('x')
        self.assertEqual(set(survivors), {3, 4, 5, 9})

    @defer.inlineCallbacks
    def test_no_way(self):
        """On an active connection the cancelled insert is still executed."""
        # If ConnectionPool picks already active connection, the query is sent
        # to MongoDB immediately and there is no way to cancel it

        yield self.coll.count()

        pending = self.coll.insert({'x': 42})
        pending.cancel()

        yield _delay(1)

        self.failureResultOf(pending, defer.CancelledError)

        total = yield self.coll.count()
        self.assertEqual(total, 1)