Example #1
0
    def setUp(self):
        """Start the mongod instances and wait for the replica set to be ready.

        Every ``yield`` hands a Deferred back to the caller, so this method is
        presumably driven by ``defer.inlineCallbacks`` (the decorator is not
        visible here).

        The set counts as ready once ``replSetGetStatus`` reports ``ok``,
        ``ismaster`` reports this node as master and every listed member is in
        the PRIMARY or SECONDARY state. The check is repeated every
        ``self.__ping_interval`` seconds.

        Raises:
            Exception: if the set is not ready within ``self.__init_timeout``
                seconds; ``self.tearDown()`` is run before raising.
        """
        self.__mongod = [Mongod(port=p, replset=self.rsname) for p in self.ports]
        yield defer.gatherResults([mongo.start() for mongo in self.__mongod])

        master_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(self.ports[0])
        master = ConnectionPool(master_uri)

        ready = False
        try:
            yield master.admin.command("replSetInitiate", self.rsconfig)

            ok_states = set(["PRIMARY", "SECONDARY"])
            n_tries = int(self.__init_timeout / self.__ping_interval)
            for _ in range(n_tries):
                yield self.__sleep(self.__ping_interval)

                # My practice shows that we need to query both ismaster and replSetGetStatus
                # to be sure that replica set is up and running, primary is elected and all
                # secondaries are in sync and ready to become new primary

                ismaster_req = master.admin.command("ismaster", check=False)
                replstatus_req = master.admin.command("replSetGetStatus", check=False)
                ismaster, replstatus = yield defer.gatherResults([ismaster_req, replstatus_req])

                initialized = replstatus["ok"]
                states_ready = all(m["stateStr"] in ok_states for m in replstatus.get("members", []))
                ready = initialized and ismaster["ismaster"] and states_ready

                if ready:
                    break
        finally:
            # Close the helper pool on every path (success, timeout, or a
            # failing command) so no connection outlives setUp.
            yield master.disconnect()

        if not ready:
            yield self.tearDown()
            raise Exception("ReplicaSet initialization took more than {0}s".format(self.__init_timeout))
Example #2
0
    def test_SlaveOk(self):
        """Query the node on ``self.ports[1]`` with slave-ok and expect writes to fail.

        The slave-ok ``find`` must return an empty list, and an ``insert`` through
        the same pool must fail with ``OperationFailure``.
        """
        node_port = self.ports[1]
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(node_port)
        )
        try:
            docs = yield pool.db.coll.find(flags=QUERY_SLAVE_OK)
            self.assertEqual(docs, [])

            rejected_insert = pool.db.coll.insert({'x': 42})
            yield self.assertFailure(rejected_insert, OperationFailure)
        finally:
            # The pool is closed whether or not the assertions passed.
            yield pool.disconnect()
Example #3
0
    def test_AutoReconnect_from_primary_step_down(self):
        """Check that a forced primary step-down fails the command with AutoReconnect.

        Connects to the node on ``self.ports[0]`` with a write concern equal to
        the number of ports and issues ``replSetStepDown`` with ``force``.
        """
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri, max_delay=5)

        try:
            # this will force primary to step down, triggering an AutoReconnect that bubbles up
            # through the connection pool to the client
            command = conn.admin.command(SON([('replSetStepDown', 86400), ('force', 1)]))

            # assertFailure returns a Deferred; it has to be waited for, otherwise
            # the outcome of the assertion never reaches the test result.
            yield self.assertFailure(command, AutoReconnect)
        finally:
            # Close the pool even when the assertion fails.
            yield conn.disconnect()
Example #4
0
    def test_AutoReconnect_from_primary_step_down(self):
        """Check that a forced primary step-down fails the command with AutoReconnect.

        ``_Connection.maxDelay`` is patched to 5 for the duration of the test.
        The pool connects to the node on ``self.ports[0]`` with a write concern
        equal to the number of ports and issues ``replSetStepDown`` with
        ``force``.
        """
        self.patch(_Connection, 'maxDelay', 5)
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri)

        try:
            # this will force primary to step down, triggering an AutoReconnect that bubbles up
            # through the connection pool to the client
            command = conn.admin.command(SON([('replSetStepDown', 86400), ('force', 1)]))

            # assertFailure returns a Deferred; it has to be waited for, otherwise
            # the outcome of the assertion never reaches the test result.
            yield self.assertFailure(command, AutoReconnect)
        finally:
            # Close the pool even when the assertion fails.
            yield conn.disconnect()
Example #5
0
    def test_ConnectionUrlParams(self):
        """Check that ``w`` and ``j`` from the connection URL reach the mocked GLE call.

        After one insert into ``mydb.mycol`` the mock obtained from
        ``self.mock_gle()`` must have been called exactly once with
        ``('mydb', w=2, j=True)``.
        """
        url = "mongodb://{0}:{1}/?w=2&j=true".format(mongo_host, mongo_port)
        pool = ConnectionPool(url)
        collection = pool.mydb.mycol

        try:
            with self.mock_gle() as gle:
                yield collection.insert({'x': 42})
                gle.assert_called_once_with('mydb', w=2, j=True)
        finally:
            # Remove the test collection, then close the pool.
            yield collection.drop()
            yield pool.disconnect()
Example #6
0
    def test_ConnectionUrlParams(self):
        """Verify that URL options ``w=2&j=true`` are forwarded to the mocked GLE call.

        One document is inserted into ``mydb.mycol`` while ``self.mock_gle()`` is
        active; the mock must then record a single call with
        ``('mydb', w=2, j=True)``.
        """
        connection_url = "mongodb://{0}:{1}/?w=2&j=true".format(mongo_host, mongo_port)
        pool = ConnectionPool(connection_url)
        target = pool.mydb.mycol

        try:
            with self.mock_gle() as gle_mock:
                yield target.insert({'x': 42})
                gle_mock.assert_called_once_with('mydb', w=2, j=True)
        finally:
            # Cleanup runs regardless of the assertion outcome.
            yield target.drop()
            yield pool.disconnect()
Example #7
0
    def test_SlaveOk(self):
        """Query the node on ``self.ports[1]`` with slave-ok and expect writes to fail.

        The slave-ok ``find`` must return an empty list. The error expected from
        the subsequent ``insert`` depends on the server version reported by
        ``serverStatus``: ``AutoReconnect`` when the parsed version list compares
        greater than ``[4, 2]``, ``OperationFailure`` otherwise.
        """
        pool = ConnectionPool(
            "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(self.ports[1])
        )
        try:
            found = yield pool.db.coll.find(flags=QUERY_SLAVE_OK)
            self.assertEqual(found, [])

            status = yield pool.admin.command("serverStatus")
            version = list(map(int, status["version"].split('.')))

            if version > [4, 2]:
                expected_error = AutoReconnect
            else:
                expected_error = OperationFailure

            rejected_insert = pool.db.coll.insert({'x': 42})
            yield self.assertFailure(rejected_insert, expected_error)
        finally:
            # The pool is closed whether or not the assertions passed.
            yield pool.disconnect()
Example #8
0
    def setUp(self):
        """Start the mongod instances and wait for the replica set to be ready.

        Every ``yield`` hands a Deferred back to the caller, so this method is
        presumably driven by ``defer.inlineCallbacks`` (the decorator is not
        visible here).

        Steps: start one ``Mongod`` per port, run ``self.__check_reachable`` for
        every port, send ``replSetInitiate`` to the node on ``self.ports[0]``
        and then poll every ``self.__ping_interval`` seconds. The set counts as
        ready once ``replSetGetStatus`` reports ``ok``, ``ismaster`` reports
        this node as master and every listed member is PRIMARY or SECONDARY.

        Raises:
            Exception: if the set is not ready within ``self.__init_timeout``
                seconds; ``self.tearDown()`` is run before raising.
        """
        self.__mongod = [
            Mongod(port=p, replset=self.rsname) for p in self.ports
        ]
        yield defer.gatherResults([mongo.start() for mongo in self.__mongod])

        yield defer.gatherResults(
            [self.__check_reachable(port) for port in self.ports])

        master_uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(
            self.ports[0])
        master = ConnectionPool(master_uri)

        ready = False
        try:
            yield master.admin.command("replSetInitiate", self.rsconfig)

            ok_states = {"PRIMARY", "SECONDARY"}
            n_tries = int(self.__init_timeout / self.__ping_interval)
            for _ in range(n_tries):
                yield self.__sleep(self.__ping_interval)

                # My practice shows that we need to query both ismaster and replSetGetStatus
                # to be sure that replica set is up and running, primary is elected and all
                # secondaries are in sync and ready to become new primary

                ismaster_req = master.admin.command("ismaster", check=False)
                replstatus_req = master.admin.command("replSetGetStatus",
                                                      check=False)
                ismaster, replstatus = yield defer.gatherResults(
                    [ismaster_req, replstatus_req])

                initialized = replstatus["ok"]
                states_ready = all(m["stateStr"] in ok_states
                                   for m in replstatus.get("members", []))
                ready = initialized and ismaster["ismaster"] and states_ready

                if ready:
                    break
        finally:
            # Close the helper pool on every path (success, timeout, or a
            # failing command) so no connection outlives setUp.
            yield master.disconnect()

        if not ready:
            yield self.tearDown()
            raise Exception(
                "ReplicaSet initialization took more than {0}s".format(
                    self.__init_timeout))
Example #9
0
    def test_AutoReconnect(self):
        """Check that reads succeed again after the first mongod is suspended.

        A document is inserted, the first mongod process receives SIGSTOP, and
        ``find_one`` is retried for as long as it raises ``AutoReconnect``. The
        first successful read must return the inserted document.
        """
        # Build the pool before entering ``try`` so that ``finally`` can always
        # rely on ``conn`` being bound.
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri, max_delay=5)

        try:
            yield conn.db.coll.insert({'x': 42}, safe=True)

            self.__mongod[0].kill(signal.SIGSTOP)

            while True:
                try:
                    result = yield conn.db.coll.find_one()
                except AutoReconnect:
                    # Keep retrying until the pool delivers a result.
                    continue
                self.assertEqual(result['x'], 42)
                break

        finally:
            # Resume the suspended process before cleaning up.
            self.__mongod[0].kill(signal.SIGCONT)
            yield conn.disconnect()
            self.flushLoggedErrors(AutoReconnect)
Example #10
0
    def test_AutoReconnect(self):
        """Check that reads succeed again after the first mongod is stopped.

        ``_Connection.maxDelay`` is patched to 5 for the duration of the test.
        A document is inserted, the first mongod is stopped, and ``find_one`` is
        retried for as long as it raises ``AutoReconnect``. The first successful
        read must return the inserted document.
        """
        self.patch(_Connection, 'maxDelay', 5)

        # Build the pool before entering ``try`` so that ``finally`` can always
        # rely on ``conn`` being bound.
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri)

        try:
            yield conn.db.coll.insert({'x': 42}, safe=True)

            yield self.__mongod[0].stop()

            while True:
                try:
                    result = yield conn.db.coll.find_one()
                except AutoReconnect:
                    # Keep retrying until the pool delivers a result.
                    continue
                self.assertEqual(result['x'], 42)
                break

        finally:
            yield conn.disconnect()
            self.flushLoggedErrors(AutoReconnect)
Example #11
0
    def test_TimeExceeded_insert(self):
        """Check that an insert with ``timeout=2`` raises TimeExceeded while mongod is suspended.

        After an initial insert the first mongod process receives SIGSTOP. The
        timed insert is retried while it raises ``AutoReconnect``; the test
        passes on ``TimeExceeded`` and fails if the insert succeeds.
        """
        # Build the pool before entering ``try`` so that ``finally`` can always
        # rely on ``conn`` being bound.
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri, retry_delay=3, max_delay=5)

        try:
            yield conn.db.coll.insert({'x': 42}, safe=True)

            self.__mongod[0].kill(signal.SIGSTOP)

            while True:
                try:
                    yield conn.db.coll.insert({'y': 42}, safe=True, timeout=2)
                    self.fail("TimeExceeded not raised!")
                except TimeExceeded:
                    break  # this is what we should have returned
                except AutoReconnect:
                    # Not the error under test; try the insert again.
                    pass

        finally:
            # Resume the suspended process before cleaning up.
            self.__mongod[0].kill(signal.SIGCONT)
            yield conn.disconnect()
            self.flushLoggedErrors(AutoReconnect)
Example #12
0
    def test_TimeExceeded_insert(self):
        """Check that an insert with ``timeout=2`` raises TimeExceeded after mongod stops.

        ``_Connection.maxDelay`` is patched to 5 for the duration of the test.
        After an initial insert the first mongod is stopped. The timed insert
        is retried while it raises ``AutoReconnect``; the test passes on
        ``TimeExceeded`` and fails if the insert succeeds.
        """
        self.patch(_Connection, 'maxDelay', 5)

        # Build the pool before entering ``try`` so that ``finally`` can always
        # rely on ``conn`` being bound.
        uri = "mongodb://localhost:{0}/?w={1}".format(self.ports[0], len(self.ports))
        conn = ConnectionPool(uri, initial_delay=3)

        try:
            yield conn.db.coll.insert({'x': 42}, safe=True)

            yield self.__mongod[0].stop()

            while True:
                try:
                    yield conn.db.coll.insert({'y': 42}, safe=True, timeout=2)
                    self.fail("TimeExceeded not raised!")
                except TimeExceeded:
                    break  # this is what we should have returned
                except AutoReconnect:
                    # Not the error under test; try the insert again.
                    pass

        finally:
            yield conn.disconnect()
            self.flushLoggedErrors(AutoReconnect)
Example #13
0
 def __check_reachable(self, port):
     """Open a pool to the mongod on *port*, run ``ismaster`` and disconnect.

     The command result is discarded; the method only waits for the command
     to complete. The pool is closed even when the command fails.
     """
     uri = "mongodb://localhost:{0}/?readPreference=secondaryPreferred".format(port)
     conn = ConnectionPool(uri)
     try:
         yield conn.admin.command("ismaster", check=False)
     finally:
         # Do not leave the probe connection open if the command errors out.
         yield conn.disconnect()
Example #14
0
class MongodbPipeline(object):
    """
    Pipeline that writes to Mongo Database

    Items are upserted into the collection named by the ``TABLE_PORTS``
    setting, inside the database named by ``STAGING_DB``, on the server
    addressed by ``MONGO_DB_URI``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Raises:
            NotConfigured: if any of ``MONGO_DB_URI``, ``STAGING_DB`` or
                ``TABLE_PORTS`` is missing or empty.
        """

        # Get MongoDB URL from settings
        mongo_url = crawler.settings.get('MONGO_DB_URI', None)
        mongo_db = crawler.settings.get('STAGING_DB', None)
        mongo_collection = crawler.settings.get('TABLE_PORTS', None)

        config = [mongo_url, mongo_db, mongo_collection]

        # All three values are required by __init__, so disable the pipeline
        # as soon as one of them is absent (not only when all are absent).
        if not all(config):
            raise NotConfigured('Mongodb parameters not configured')

        # Create the class
        return cls(config)

    def __init__(self, config):
        """Opens a MongoDB connection pool

        Args:
            config: sequence ``(mongo_url, mongo_db, mongo_collection)``.
        """

        # Report connection error only once
        self.report_connection_error = True

        mongo_url, mongo_db, mongo_collection = config

        # Accept the "mongo:" scheme as an alias for "mongodb:". Only a leading
        # scheme is rewritten, so a host or credential that happens to contain
        # "mongo:" (e.g. "mongodb://mongo:27017") is left intact.
        legacy_scheme = 'mongo:'
        if mongo_url.startswith(legacy_scheme):
            mongo_url = 'mongodb:' + mongo_url[len(legacy_scheme):]

        # Setup MongoDB Connection
        self.mongo_url = mongo_url
        self.connection = ConnectionPool(mongo_url, connect_timeout=5)
        self.mongo_db = self.connection[mongo_db]
        self.collection = self.mongo_db[mongo_collection]

    def close_spider(self, spider):
        """Close the MongoDB connection pool on spider close.

        Returns whatever ``disconnect()`` returns so the caller can wait for
        the shutdown to finish.
        """
        return self.connection.disconnect()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        """Processes the item. Does upsert into MongoDB

        The item is used both as the filter and as the replacement document.
        On the first failure the error and traceback are logged and
        ``TransactionError`` is raised; later failures are neither logged nor
        raised and the item is passed on unchanged.
        """
        logger = spider.logger
        try:
            yield self.collection.replace_one(
                filter=item,
                replacement=item,
                upsert=True
            )
        except Exception:
            # NOTE(review): only the first failure is reported and raised;
            # confirm that silently passing later items on is intended.
            if self.report_connection_error:
                logger.error("An error occured while Upserting data")
                self.report_connection_error = False
                logger.error(traceback.format_exc())
                raise TransactionError('An error occured during transaction.')

        # Return the item for the next stage
        defer.returnValue(item)
class PipelineMongoDBAsync(object):
    """Scrapy pipeline that talks to MongoDB through a ``ConnectionPool``.

    The connection is opened in ``open_spider`` and closed in ``close_spider``.
    Inserts and updates are triggered by the ``insert_for_objectid`` and
    ``update_for_objectid`` signals connected in ``from_crawler``.
    """

    def __init__(self, crawler: Crawler, *args, **kwargs):
        """Store the crawler and its settings; no connection is opened yet."""
        self.crawler = crawler
        self.settings = crawler.settings

        self.uri = get_mongodb_uri(self.settings)
        self.codec_options = DEFAULT_CODEC_OPTIONS.with_options(
            unicode_decode_error_handler='ignore')
        # Populated by open_spider.
        self.cnx = None
        self.db = None
        self.coll = None

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args, **kwargs):
        """Create the pipeline and connect its insert/update signal handlers.

        ``process_item`` is set on the class: the object named by the
        ``MONGODB_PROCESS_ITEM`` setting when that setting is truthy, otherwise
        a pass-through that returns the item unchanged.
        """
        cls.process_item = (load_object(crawler.settings[MONGODB_PROCESS_ITEM])
                            if crawler.settings.get(MONGODB_PROCESS_ITEM) else
                            lambda pipeline, item, spider: item)
        # Pass the crawler positionally: combining ``crawler=crawler`` with
        # extra positional args would bind ``crawler`` twice and raise.
        o = cls(crawler, *args, **kwargs)
        crawler.signals.connect(o.process_item_insert_one,
                                signal=insert_for_objectid)
        crawler.signals.connect(o.process_item_update_one,
                                signal=update_for_objectid)
        return o

    @inlineCallbacks
    def open_spider(self, spider: Spider):
        """Open the connection, select database/collection and create indexes."""
        self.cnx = ConnectionPool(self.uri, codec_options=self.codec_options)
        self.db = getattr(self.cnx, self.settings[MONGODB_DATABASE])
        self.coll = getattr(self.db, self.settings[MONGODB_COLLECTION])
        # Keep the object returned by with_options; discarding it would leave
        # self.coll without the requested codec options.
        self.coll = self.coll.with_options(codec_options=self.codec_options)

        yield self.create_index(spider)
        logger.info('Spider opened: Open the connection to MongoDB: %s',
                    self.uri)

    # TODO: ADD UNIT TEST FOR THIS FUNCTION
    # the api of create_index in txmongo is different with the one in mongomock
    @inlineCallbacks
    def create_index(self, spider: Spider):
        """Create every index listed in the ``MONGODB_INDEXES`` setting.

        Each entry is ``(field, order)`` or ``(field, order, options)``, where
        ``order`` is called with ``field`` and ``options`` is a dict of keyword
        arguments for ``create_index``. An ``OperationFailure`` for one index
        is logged and the remaining indexes are still attempted.

        Returns:
            list of the results of the successful ``create_index`` calls.
        """
        results = []
        for field, _order, *args in self.settings.get(MONGODB_INDEXES, list()):
            # The options dict is optional; fall back to no extra arguments.
            options = args[0] if args else {}
            try:
                created = yield self.coll.create_index(
                    txfilter.sort(_order(field)), **options)
            except OperationFailure as exc:
                logger.warning('Failed to create MongoDB index on %s: %s',
                               field, exc)
            else:
                results.append(created)
        return results

    @inlineCallbacks
    def close_spider(self, spider: Spider):
        """Disconnect from MongoDB when the spider closes."""
        yield self.cnx.disconnect()
        logger.info('Spider closed: Close the connection to MongoDB %s',
                    self.uri)

    @inlineCallbacks
    def process_item_insert_one(self, doc: Dict, spider: Spider) -> Generator:
        """Insert *doc* into the collection and return the driver's result."""
        result = yield self.coll.insert_one(doc)
        return result

    @inlineCallbacks
    def process_item_update_one(self, filter_: Dict, update: Dict,
                                upsert: bool, spider: Spider) -> Generator:
        """Run ``update_one`` with the given filter/update/upsert and return its result."""
        result = yield self.coll.update_one(filter=filter_,
                                            update=update,
                                            upsert=upsert)
        return result
Example #16
0
class TestCancelIntegrated(unittest.TestCase):
    """Integration tests for cancelling operations issued through a ConnectionPool."""

    def setUp(self):
        """Create a default pool and keep handles to ``db`` and ``db.coll``."""
        pool = ConnectionPool()
        self.conn = pool
        self.db = pool.db
        self.coll = self.db.coll

    @defer.inlineCallbacks
    def tearDown(self):
        """Drop the test collection, then close the pool."""
        yield self.coll.drop()
        yield self.conn.disconnect()

    @defer.inlineCallbacks
    def test_integration(self):
        """Inserts cancelled before the pool connects must never be stored."""
        # The pool has not connected at this point, so queued operations can
        # still be cancelled safely: they are never sent to MongoDB.
        first, second, third, fourth = [
            self.coll.insert_one({'x': value}) for value in (1, 2, 3, 4)
        ]

        for doomed in (first, third):
            doomed.cancel()

        yield fourth

        self.failureResultOf(first, defer.CancelledError)
        self.assertTrue(second.called)
        self.failureResultOf(third, defer.CancelledError)

        stored = yield self.coll.distinct('x')
        self.assertEqual(set(stored), {2, 4})

    @defer.inlineCallbacks
    def test_remove(self):
        """A remove cancelled before the pool connects must leave its documents in place."""
        # Same check for a destructive operation. NB: the remove is only
        # cancellable because the pool has not connected yet.
        for value in range(10):
            self.coll.insert_one({'x': value})

        low = self.coll.remove({'x': {"$lt": 3}})
        middle = self.coll.remove({'x': {"$gte": 3, "$lt": 6}})
        high = self.coll.remove({'x': {"$gte": 6, "$lt": 9}})

        middle.cancel()

        yield high

        self.assertTrue(low.called)
        self.failureResultOf(middle, defer.CancelledError)

        remaining = yield self.coll.distinct('x')
        self.assertEqual(set(remaining), {3, 4, 5, 9})

    @defer.inlineCallbacks
    def test_no_way(self):
        """Cancelling after the pool is connected does not stop the insert."""
        # Once the pool hands out an already active connection the query is
        # sent to MongoDB right away, so cancelling cannot take it back.

        yield self.coll.count()

        pending = self.coll.insert({'x': 42})
        pending.cancel()

        yield _delay(1)

        self.failureResultOf(pending, defer.CancelledError)

        total = yield self.coll.count()
        self.assertEqual(total, 1)