Example #1
    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise RuntimeError('invalid config type')
        self._conf = conf

        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]

        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last applied oplog entry
        self._last_optime_logtime = time.time()

        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

        # for large collections
        self._n_workers = 8  # multi-process
        self._large_coll_docs = 1000000  # i.e. one million documents

        self._initial_sync_start_optime = None
        self._initial_sync_end_optime = None

        self._stage = Stage.STOPPED
    def parse():
        """ Parse command options and generate config.
        """
        conf = Config()

        parser = argparse.ArgumentParser(description='Sync data from a replica-set to another MongoDB/Elasticsearch.')
        parser.add_argument('-f', '--config', nargs='?', required=False, help='configuration file, note that command options will override items in config file')
        parser.add_argument('--src', nargs='?', required=False, help='source should be hostportstr of a replica-set member')
        parser.add_argument('--src-authdb', nargs='?', required=False, help="src authentication database, default is 'admin'")
        parser.add_argument('--src-username', nargs='?', required=False, help='src username')
        parser.add_argument('--src-password', nargs='?', required=False, help='src password')
        parser.add_argument('--dst', nargs='?', required=False, help='destination should be hostportstr of a mongos or mongod instance')
        parser.add_argument('--dst-authdb', nargs='?', required=False, help="dst authentication database, default is 'admin', for MongoDB")
        parser.add_argument('--dst-username', nargs='?', required=False, help='dst username, for MongoDB')
        parser.add_argument('--dst-password', nargs='?', required=False, help='dst password, for MongoDB')
        parser.add_argument('--start-optime', type=int, nargs='?', required=False, help='timestamp in seconds; enables oplog-based incremental sync')
        parser.add_argument('--optime-logfile', nargs='?', required=False, help="optime log file path; used as the start optime when '--start-optime' is not given")
        parser.add_argument('--logfile', nargs='?', required=False, help='log file path')

        args = parser.parse_args()

        if args.config is not None:
            conf = ConfigFile.load(args.config)
        if args.src is not None:
            conf.src_conf.hosts = args.src
        if args.src_authdb is not None:
            conf.src_conf.authdb = args.src_authdb
        if args.src_username is not None:
            conf.src_conf.username = args.src_username
        if args.src_password is not None:
            conf.src_conf.password = args.src_password
        if args.dst is not None:
            conf.dst_conf.hosts = args.dst
        if args.dst_authdb is not None:
            conf.dst_conf.authdb = args.dst_authdb
        if args.dst_username is not None:
            conf.dst_conf.username = args.dst_username
        if args.dst_password is not None:
            conf.dst_conf.password = args.dst_password
        if args.start_optime is not None:
            conf.start_optime = Timestamp(args.start_optime, 0)
        if args.optime_logfile is not None:
            conf.optime_logfilepath = args.optime_logfile
            if args.start_optime is None:
                optime_logger = OptimeLogger(args.optime_logfile)
                conf.start_optime = optime_logger.read()
        if args.logfile is not None:
            conf.logfilepath = args.logfile

        return conf
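parse() merges an optional TOML config file (given via -f) with individual command-line overrides and returns a Config. A hedged sketch of how it might be driven, assuming parse() is exposed as a static method of a wrapper class (here called OptionsParser, a name not shown in these examples) and that a concrete syncer class exists:

# Hypothetical entry point; OptionsParser and MongoSyncer are assumed names.
#   python sync.py -f sync.toml --src 127.0.0.1:27017 --dst 127.0.0.1:27018
if __name__ == '__main__':
    conf = OptionsParser.parse()
    syncer = MongoSyncer(conf)  # a concrete subclass of the syncer base class
    syncer.run()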
Example #4
    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise Exception('invalid config type')
        self._conf = conf

        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]

        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last replayed oplog entry
        self._last_optime_logtime = time.time()

        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay
Example #5
class CommonSyncer(object):
    """ Common database synchronizer.

    Concrete database synchronizers should implement the following methods
    (a minimal subclass sketch follows this example):
        - __init__
        - _initial_sync
        - _sync_collection
        - _sync_large_collection
        - _replay_oplog
    """
    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise RuntimeError('invalid config type')
        self._conf = conf

        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]

        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last applied oplog entry
        self._last_optime_logtime = time.time()

        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

        # for large collections
        self._n_workers = 8  # multi-process
        self._large_coll_docs = 1000000  # i.e. one million documents

        self._initial_sync_start_optime = None
        self._initial_sync_end_optime = None

        self._stage = Stage.STOPPED

    @property
    def from_to(self):
        return "%s => %s" % (self._conf.src_hostportstr,
                             self._conf.dst_hostportstr)

    @property
    def log_interval(self):
        return self._log_interval

    @log_interval.setter
    def log_interval(self, n_secs):
        if n_secs < 0:
            n_secs = 0
        self._log_interval = n_secs

    def run(self):
        """ Start to sync.
        """
        # never drop database automatically
        # clear data manually if necessary
        try:
            self._sync()
        except KeyboardInterrupt:
            log.info('keyboard interrupt')

    def _sync(self):
        """ Sync databases and oplog.
        """
        if self._conf.start_optime:
            log.info("locating oplog, it will take a while")
            doc = None
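            # find the newest oplog entry at or before start_optime; if there is
            # none, the requested optime has already rolled off the capped oplog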
            cur = self._src.client()['local']['oplog.rs'].find(
                {
                    'ts': {
                        '$lte': self._conf.start_optime
                    }
                }, {
                    "ts": 1
                }).sort("$natural", -1).limit(1)
            try:
                doc = cur.next()
            except StopIteration:
                pass
            if not doc:
                log.error('oplog is stale')
                return
            start_optime = doc['ts']
            log.info('actual start timestamp is %s' % start_optime)
            self._stage = Stage.OPLOG_SYNC
            self._replay_oplog(start_optime)
        else:
            # initial sync
            self._initial_sync_start_optime = get_optime(self._src.client())
            self._stage = Stage.INITIAL_SYNC

            self._initial_sync()

            self._stage = Stage.POST_INITIAL_SYNC
            self._initial_sync_end_optime = get_optime(self._src.client())

            # oplog sync
            if self._optime_logger:
                self._optime_logger.write(self._initial_sync_start_optime)
            self._replay_oplog(self._initial_sync_start_optime)

    def _collect_colls(self):
        """ Collect collections to sync.
        """
        colls = []
        for dbname in self._src.client().database_names():
            if dbname in self._ignore_dbs:
                continue
            if not self._conf.data_filter.valid_db(dbname):
                continue
            for collname in self._src.client()[dbname].collection_names(
                    include_system_collections=False):
                if collname in self._ignore_colls:
                    continue
                if not self._conf.data_filter.valid_coll(dbname, collname):
                    continue
                colls.append((dbname, collname))
        return colls

    def _split_coll(self, namespace_tuple, n_partitions):
        """ Split a collection into n partitions.

        Return a list of split points.

        splitPointCount = partitionCount - 1
        splitPointCount = keyTotalCount / (keyCount + 1)
        keyCount = maxChunkSize / (2 * avgObjSize)
        =>
        maxChunkSize = (keyTotalCount / (partitionCount - 1) - 1) * 2 * avgObjSize

        Note: maxChunkObjects defaults to 250000.
        """
        if n_partitions <= 1:
            raise RuntimeError('n_partitions must be greater than 1, got %s' %
                               n_partitions)

        dbname, collname = namespace_tuple
        ns = '.'.join(namespace_tuple)
        db = self._src.client()[dbname]
        collstats = db.command('collstats', collname)

        if 'avgObjSize' not in collstats:  # empty collection
            return []

        n_points = n_partitions - 1
        max_chunk_size = int(
            ((collstats['count'] /
              (n_partitions - 1) - 1) * 2 * collstats['avgObjSize']) / 1024 /
            1024)

        if max_chunk_size <= 0:
            return []

        res = db.command('splitVector',
                         ns,
                         keyPattern={'_id': 1},
                         maxSplitPoints=n_points,
                         maxChunkSize=max_chunk_size,
                         maxChunkObjects=collstats['count'])

        if res['ok'] != 1:
            return []
        else:
            return [doc['_id'] for doc in res['splitKeys']]

    def _initial_sync(self):
        """ Initial sync.
        """
        def classify(ns_tuple, large_colls, small_colls):
            """ Find out large and small collections.
            """
            if self._is_large_collection(ns_tuple):
                points = self._split_coll(ns_tuple, self._n_workers)
                if points:
                    large_colls.append((ns_tuple, points))
                else:
                    small_colls.append(ns_tuple)
            else:
                small_colls.append(ns_tuple)

        large_colls = []
        small_colls = []

        pool = gevent.pool.Pool(8)
        colls = self._collect_colls()
        for ns in colls:
            dbname, collname = ns
            log.info('%d\t%s.%s' %
                     (self._src.client()[dbname][collname].count(), dbname,
                      collname))
            pool.spawn(classify, ns, large_colls, small_colls)
        pool.join()

        if len(large_colls) + len(small_colls) != len(colls):
            raise RuntimeError('classify collections error')

        log.info('large collections: %s' %
                 ['.'.join(ns) for ns, points in large_colls])
        log.info('small collections: %s' %
                 ['.'.join(ns) for ns in small_colls])

        # create progress logger
        self._progress_logger = LoggerThread(len(colls))
        self._progress_logger.start()

        # small collections first
        pool = gevent.pool.Pool(8)
        for res in pool.imap(self._sync_collection, small_colls):
            if res is not None:
                sys.exit(1)

        # then large collections
        for ns, points in large_colls:
            self._sync_large_collection(ns, points)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_collection.__name__))

    def _is_large_collection(self, namespace_tuple):
        """ Check if large collection or not.
        """
        dbname, collname = namespace_tuple
        return (self._src.client()[dbname][collname].count() >
                self._large_coll_docs)

    def _sync_large_collection(self, namespace_tuple, split_points):
        """ Sync large collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_large_collection.__name__))

    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._replay_oplog.__name__))

    def _log_progress(self, tag=''):
        """ Print progress periodically.
        """
        now = time.time()
        if now - self._last_logtime >= self._log_interval:
            delay = now - self._last_optime.time
            time_unit = 'second' if delay <= 1 else 'seconds'
            if tag:
                log.info(
                    '%s - sync to %s - %d %s delay - %s - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime, tag))
            else:
                log.info(
                    '%s - sync to %s - %d %s delay - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime))
            self._last_logtime = now

    def _log_optime(self, optime):
        """ Record optime periodically.
        """
        if not self._optime_logger:
            return
        now = time.time()
        if now - self._last_optime_logtime >= self._optime_log_interval:
            self._optime_logger.write(optime)
            self._last_optime_logtime = now
            log.info("flush optime into file '%s': %s" %
                     (self._optime_logger.filepath, optime))
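CommonSyncer above is a template: run() drives _sync(), which delegates the real work to the hooks listed in the class docstring. A minimal, purely illustrative subclass with a hypothetical name and log-only bodies (the actual MongoDB/Elasticsearch implementations are not part of these examples):

class LogOnlySyncer(CommonSyncer):
    """ Hypothetical subclass that only logs what it would sync. """
    def _initial_sync(self):
        for ns in self._collect_colls():
            self._sync_collection(ns)

    def _sync_collection(self, namespace_tuple):
        log.info('would sync %s' % '.'.join(namespace_tuple))

    def _sync_large_collection(self, namespace_tuple, split_points):
        log.info('would sync %s in %d partitions' %
                 ('.'.join(namespace_tuple), len(split_points) + 1))

    def _replay_oplog(self, oplog_start):
        log.info('would replay oplog from %s' % oplog_start)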
Example #6
    def load(filepath):
        """ Load config file and generate conf.
        """
        conf = Config()
        tml = toml.load(filepath)
        conf.src_conf = MongoConfig(tml['src']['hosts'],
                                    tml['src'].get('authdb', 'admin'),
                                    tml['src'].get('username', ''),
                                    tml['src'].get('password', ''))

        if 'type' not in tml['dst'] or tml['dst']['type'] == 'mongo':
            conf.dst_conf = MongoConfig(tml['dst']['hosts'],
                                        tml['dst'].get('authdb', 'admin'),
                                        tml['dst'].get('username', ''),
                                        tml['dst'].get('password', ''))
        elif tml['dst']['type'] == 'es':
            conf.dst_conf = EsConfig(tml['dst']['hosts'])
        else:
            raise Exception('invalid dst.type')

        if 'sync' in tml and 'dbs' in tml['sync']:
            for dbentry in tml['sync']['dbs']:
                if 'db' not in dbentry:
                    raise Exception("'db' is missing in sync.dbs")
                if not dbentry['db']:
                    raise Exception("'db' is empty in sync.dbs")
                dbname = dbentry['db'].strip()
                rename_db = dbentry['rename_db'].strip(
                ) if 'rename_db' in dbentry else ""

                # update db map
                if dbname and rename_db:
                    if dbname in conf.dbmap:
                        raise Exception('duplicate dbname in sync.dbs: %s' %
                                        dbname)
                    conf.dbmap[dbname] = rename_db

                if 'colls' in dbentry and dbentry['colls']:
                    for collentry in dbentry['colls']:
                        if isinstance(collentry, str):
                            collname = collentry.strip()
                            ns = gen_namespace(dbname, collname)
                            conf.data_filter.add_include_coll(ns)
                        elif isinstance(collentry, dict):
                            if 'coll' not in collentry:
                                raise Exception(
                                    "'coll' is missing in sync.dbs.colls")
                            if not collentry['coll']:
                                raise Exception(
                                    "'coll' is empty in sync.dbs.colls")

                            collname = collentry['coll'].strip()
                            fields = frozenset(
                                [f.strip()
                                 for f in collentry['fields']] if 'fields' in
                                collentry else [])

                            # update coll filter
                            ns = gen_namespace(dbname, collname)
                            conf.data_filter.add_include_coll(ns)

                            # update fields
                            if fields:
                                if ns in conf.fieldmap:
                                    raise Exception(
                                        "duplicate collname in sync.dbs.colls: %s"
                                        % ns)
                                conf.fieldmap[ns] = fields
                        else:
                            raise Exception(
                                'invalid entry in sync.dbs.colls: %s' %
                                collentry)
                else:
                    # update coll filter
                    conf.data_filter.add_include_coll(
                        gen_namespace(dbname, '*'))

        if 'sync' in tml and 'start_optime' in tml['sync']:
            conf.start_optime = Timestamp(tml['sync']['start_optime'], 0)

        if 'log' in tml and 'filepath' in tml['log']:
            conf.logfilepath = tml['log']['filepath']

        if 'log' in tml and 'op_time_path' in tml['log']:
            conf.optime_logfilepath = tml['log']['op_time_path']
            optime_logger = OptimeLogger(conf.optime_logfilepath)
            start_optime = optime_logger.read()
            if start_optime:
                conf.start_optime = start_optime

        return conf
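load() reads a TOML file with [src] and [dst] sections, an optional list of [[sync.dbs]] tables, and a [log] section. A hedged sketch of a config this loader would accept, parsed here with toml.loads; the hostnames, database and collection names are made up for illustration:

import toml

SAMPLE = '''
[src]
hosts = "127.0.0.1:27017"
authdb = "admin"
username = ""
password = ""

[dst]
type = "mongo"   # or "es" for an Elasticsearch destination
hosts = "127.0.0.1:27018"

[sync]
# start_optime = 1596240000   # optional, seconds since epoch

[[sync.dbs]]
db = "mydb"
rename_db = "mydb_copy"
colls = ["users", "orders"]   # entries may also be tables with 'coll' and 'fields'

[log]
filepath = "sync.log"
op_time_path = "optime.log"
'''

tml = toml.loads(SAMPLE)
print(tml['sync']['dbs'][0]['db'])  # -> mydb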
Example #7
class Synchronizer(object):
    """ Common synchronizer.

    Concrete synchronizers should implement the following methods:
        - __init__
        - __del__
        - _sync_database
        - _sync_collection
        - _sync_oplog
    """
    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise Exception('invalid config type')
        self._conf = conf

        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]

        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last replayed oplog entry
        self._last_optime_logtime = time.time()

        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

    @property
    def from_to(self):
        return "%s => %s" % (self._conf.src_hostportstr,
                             self._conf.dst_hostportstr)

    @property
    def log_interval(self):
        return self._log_interval

    @log_interval.setter
    def log_interval(self, n_secs):
        if n_secs < 0:
            n_secs = 0
        self._log_interval = n_secs

    def run(self):
        """ Start to sync.
        """
        # never drop database automatically
        # you should clear the databases manually if necessary
        try:
            self._sync()
        except KeyboardInterrupt:
            log.info('keyboard interrupt')

    def _sync(self):
        """ Sync databases and oplog.
        """
        if self._conf.start_optime:
            # TODO optimize
            log.info("locating oplog, it will take a while")
            oplog_start = self._conf.start_optime
            doc = self._src.client()['local']['oplog.rs'].find_one(
                {'ts': {
                    '$gte': oplog_start
                }})
            if not doc:
                log.error('no oplog entries newer than the specified optime')
                return
            oplog_start = doc['ts']
            log.info('actual start timestamp is %s' % oplog_start)
            self._last_optime = oplog_start
            self._sync_oplog(oplog_start)
        else:
            oplog_start = get_optime(self._src.client())
            if not oplog_start:
                log.error('get oplog_start failed, terminate')
                sys.exit(1)
            self._last_optime = oplog_start
            self._sync_databases()
            if self._optime_logger:
                self._optime_logger.write(oplog_start)
                log.info('first %s' % oplog_start)
            self._sync_oplog(oplog_start)

    def _sync_databases(self):
        """ Sync databases excluding 'admin' and 'local'.
        """
        host, port = self._src.client().address
        log.info('sync databases from %s:%d' % (host, port))
        for dbname in self._src.client().database_names():
            if dbname in self._ignore_dbs:
                log.info("skip database '%s'" % dbname)
                continue
            if not self._conf.data_filter.valid_db(dbname):
                log.info("skip database '%s'" % dbname)
                continue
            self._sync_database(dbname)
        log.info('all databases done')

    def _sync_database(self, dbname):
        """ Sync a database.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_database.__name__))

    def _sync_collections(self, dbname):
        """ Sync collections in the database excluding system collections.
        """
        collnames = self._src.client()[dbname].collection_names(
            include_system_collections=False)
        for collname in collnames:
            if collname in self._ignore_colls:
                log.info("skip collection '%s'" %
                         gen_namespace(dbname, collname))
                continue
            if not self._conf.data_filter.valid_coll(dbname, collname):
                log.info("skip collection '%s'" %
                         gen_namespace(dbname, collname))
                continue
            self._sync_collection(dbname, collname)

    def _sync_collection(self, dbname, collname):
        """ Sync a collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_collection.__name__))

    def _sync_oplog(self, oplog_start):
        """ Replay oplog.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_oplog.__name__))

    def _log_progress(self, tag=''):
        """ Print progress.
        """
        now = time.time()
        if now - self._last_logtime >= self._log_interval:
            delay = now - self._last_optime.time
            time_unit = 'second' if delay <= 1 else 'seconds'
            if tag:
                log.info(
                    '%s - sync to %s - %d %s delay - %s - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime, tag))
            else:
                log.info(
                    '%s - sync to %s - %d %s delay - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime))
            self._last_logtime = now

    def _log_optime(self, optime):
        """ Record optime.
        """
        if not self._optime_logger:
            return
        now = time.time()
        if now - self._last_optime_logtime >= self._optime_log_interval:
            self._optime_logger.write(optime)
            self._last_optime_logtime = now
            log.info("flush optime into file '%s': %s" %
                     (self._optime_logger.filepath, optime))