Ejemplo n.º 1
0
    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1]
        idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
        fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'" % (src_dbname, src_collname, idxname, typename))
                cursor = self._src.client()[src_dbname][src_collname].find(filter=None,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info('    skip empty collection')
                    return

                n = 0
                actions = []
                actions_max = 20
                groups = []
                groups_max = 10

                for doc in cursor:
                    id = str(doc['_id'])
                    del doc['_id']
                    source = gen_doc_with_fields(doc, fields) if fields else doc
                    if source:
                        actions.append(
                            {'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': source})
                    if len(actions) == actions_max:
                        groups.append(actions)
                        actions = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(groups_max)]
                        gevent.joinall(threads, raise_error=True)
                        groups = []

                    n += 1
                    if n % 1000 == 0:
                        log.info(
                            '    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n) / count * 100))

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(len(groups))]
                    gevent.joinall(threads, raise_error=True)
                if len(actions) > 0:
                    elasticsearch.helpers.bulk(client=self._dst.client(), actions=actions)

                log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n) / count * 100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()
Ejemplo n.º 2
0
    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1]
        idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
        fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'" % (src_dbname, src_collname, idxname, typename))
                cursor = self._src.client()[src_dbname][src_collname].find(filter=None,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info('    skip empty collection')
                    return

                n = 0
                actions = []
                actions_max = 20
                groups = []
                groups_max = 10

                for doc in cursor:
                    id = str(doc['_id'])
                    del doc['_id']
                    source = gen_doc_with_fields(doc, fields) if fields else doc
                    if source:
                        actions.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': source})
                    if len(actions) == actions_max:
                        groups.append(actions)
                        actions = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(groups_max)]
                        gevent.joinall(threads)
                        groups = []

                    n += 1
                    if n % 1000 == 0:
                        log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(len(groups))]
                    gevent.joinall(threads)
                if len(actions) > 0:
                    elasticsearch.helpers.bulk(client=self._dst.client(), actions=actions)

                log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()
Ejemplo n.º 3
0
    def _sync_oplog(self, oplog_start):
        """ Replay oplog.
        """
        self._last_bulk_optime = oplog_start

        n_total = 0
        n_skip = 0

        while True:
            # try to get cursor until success
            try:
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d' %
                         (self._last_bulk_optime, host, port))
                # set codec options to guarantee the order of keys in command
                coll = self._src.client()['local'].get_collection(
                    'oplog.rs',
                    codec_options=bson.codec_options.CodecOptions(
                        document_class=bson.son.SON))
                cursor = coll.find(
                    {'ts': {
                        '$gte': oplog_start
                    }},
                    cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                    no_cursor_timeout=True)

                # New in version 3.2
                # src_version = mongo_utils.get_version(self._src.client())
                # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
                #     cursor.max_await_time_ms(1000)

                valid_start_optime = False  # need to validate

                while True:
                    try:
                        if not cursor.alive:
                            log.error('cursor is dead')
                            raise pymongo.errors.AutoReconnect

                        oplog = cursor.next()
                        n_total += 1

                        if not valid_start_optime:
                            if oplog['ts'] == oplog_start:
                                log.info('oplog is ok: %s' % oplog_start)
                                valid_start_optime = True
                            else:
                                log.error('oplog %s is stale, terminate' %
                                          oplog_start)
                                return

                        # validate oplog
                        if not self._conf.data_filter.valid_oplog(oplog):
                            n_skip += 1
                            self._last_optime = oplog['ts']
                            continue

                        op = oplog['op']
                        ns = oplog['ns']

                        if op == 'i':  # insert
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(
                                dbname, collname)
                            fields = self._conf.fieldmap.get(
                                gen_namespace(dbname, collname))

                            doc = oplog['o']
                            id = str(doc['_id'])
                            del doc['_id']
                            if fields:
                                doc = gen_doc_with_fields(doc, fields)
                            if doc:
                                self._action_buf.append({
                                    '_op_type': 'index',
                                    '_index': idxname,
                                    '_type': typename,
                                    '_id': id,
                                    '_source': doc
                                })

                        elif op == 'u':  # update
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(
                                dbname, collname)
                            fields = self._conf.fieldmap.get(
                                gen_namespace(dbname, collname))

                            id = str(oplog['o2']['_id'])

                            if '$set' in oplog['o']:
                                doc = {}
                                for k, v in oplog['o']['$set'].iteritems():
                                    if not fields or k in fields:
                                        sub_doc = doc_flat_to_nested(
                                            k.split('.'), v)
                                        merge_doc(doc, sub_doc)
                                if doc:
                                    self._action_buf.append({
                                        '_op_type':
                                        'update',
                                        '_index':
                                        idxname,
                                        '_type':
                                        typename,
                                        '_id':
                                        id,
                                        '_retry_on_conflict':
                                        3,
                                        'doc':
                                        doc,
                                        'doc_as_upsert':
                                        True
                                    })

                            if '$unset' in oplog['o']:
                                script_statements = []
                                for keypath in oplog['o']['$unset'].iterkeys():
                                    if not fields or keypath in fields:
                                        pos = keypath.rfind('.')
                                        if pos >= 0:
                                            script_statements.append(
                                                'ctx._source.%s.remove("%s")' %
                                                (keypath[:pos],
                                                 keypath[pos + 1:]))
                                        else:
                                            script_statements.append(
                                                'ctx._source.remove("%s")' %
                                                keypath)
                                if script_statements:
                                    doc = {
                                        'script': '; '.join(script_statements)
                                    }
                                    self._action_buf.append({
                                        '_op_type':
                                        'update',
                                        '_index':
                                        idxname,
                                        '_type':
                                        typename,
                                        '_id':
                                        id,
                                        '_retry_on_conflict':
                                        3,
                                        'script':
                                        doc['script']
                                    })

                            if '$set' not in oplog[
                                    'o'] and '$unset' not in oplog['o']:
                                log.warn('unexpect oplog: %s', oplog['o'])

                        elif op == 'd':  # delete
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(
                                dbname, collname)
                            id = str(oplog['o']['_id'])
                            self._action_buf.append({
                                '_op_type': 'delete',
                                '_index': idxname,
                                '_type': typename,
                                '_id': id
                            })

                        elif op == 'c':  # command
                            dbname, _ = parse_namespace(ns)
                            idxname = self._conf.db_mapping(dbname)
                            if 'drop' in oplog['o']:
                                # TODO
                                # how to delete type?
                                pass
                                log.warn(
                                    'you should implement document type deletion.'
                                )
                            if 'dropDatabase' in oplog['o']:
                                # delete index
                                self._dst.client().indices.delete(
                                    index=idxname)

                        elif op == 'n':  # no-op
                            pass
                        else:
                            log.error('invalid optype: %s' % oplog)

                        # flush
                        if self._action_buf_full():
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = oplog['ts']

                        self._last_optime = oplog['ts']
                        self._log_optime(oplog['ts'])
                        self._log_progress()
                    except StopIteration as e:
                        # flush
                        if len(self._action_buf) > 0:
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = self._last_optime
                        self._log_optime(self._last_optime)
                        self._log_progress('latest')
                        time.sleep(0.1)
                    except pymongo.errors.AutoReconnect as e:
                        log.error(e)
                        self._src.reconnect()
                        break
                    except elasticsearch.helpers.BulkIndexError as e:
                        log.error(e)
                        self._action_buf = []
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % oplog_start)
                return
Ejemplo n.º 4
0
    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        self._last_bulk_optime = oplog_start

        n_total = 0
        n_skip = 0

        while True:
            # try to get cursor until success
            try:
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d' % (self._last_bulk_optime, host, port))
                # set codec options to guarantee the order of keys in command
                coll = self._src.client()['local'].get_collection('oplog.rs',
                                                                  codec_options=bson.codec_options.CodecOptions(document_class=bson.son.SON))
                cursor = coll.find({'ts': {'$gte': oplog_start}},
                                   cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                                   no_cursor_timeout=True)

                # New in version 3.2
                # src_version = mongo_utils.get_version(self._src.client())
                # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
                #     cursor.max_await_time_ms(1000)

                valid_start_optime = False  # need to validate

                while True:
                    try:
                        if not cursor.alive:
                            log.error('cursor is dead')
                            raise pymongo.errors.AutoReconnect

                        oplog = cursor.next()
                        n_total += 1

                        if not valid_start_optime:
                            if oplog['ts'] == oplog_start:
                                log.info('oplog is ok: %s' % oplog_start)
                                valid_start_optime = True
                            else:
                                log.error('oplog %s is stale, terminate' % oplog_start)
                                return

                        # validate oplog
                        if not self._conf.data_filter.valid_oplog(oplog):
                            n_skip += 1
                            self._last_optime = oplog['ts']
                            continue

                        op = oplog['op']
                        ns = oplog['ns']

                        if op == 'i':  # insert
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))

                            doc = oplog['o']
                            id = str(doc['_id'])
                            del doc['_id']
                            if fields:
                                doc = gen_doc_with_fields(doc, fields)
                            if doc:
                                self._action_buf.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': doc})

                        elif op == 'u':  # update
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))

                            id = str(oplog['o2']['_id'])

                            if '$set' in oplog['o']:
                                doc = {}
                                for k, v in oplog['o']['$set'].iteritems():
                                    if not fields or k in fields:
                                        sub_doc = doc_flat_to_nested(k.split('.'), v)
                                        merge_doc(doc, sub_doc)
                                if doc:
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'doc': doc,
                                                             'doc_as_upsert': True})

                            if '$unset' in oplog['o']:
                                script_statements = []
                                for keypath in oplog['o']['$unset'].iterkeys():
                                    if not fields or keypath in fields:
                                        pos = keypath.rfind('.')
                                        if pos >= 0:
                                            script_statements.append('ctx._source.%s.remove("%s")' % (keypath[:pos], keypath[pos+1:]))
                                        else:
                                            script_statements.append('ctx._source.remove("%s")' % keypath)
                                if script_statements:
                                    doc = {'script': '; '.join(script_statements)}
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'script': doc['script']})

                            if '$set' not in oplog['o'] and '$unset' not in oplog['o']:
                                log.warn('unexpect oplog: %s', oplog['o'])

                        elif op == 'd':  # delete
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            id = str(oplog['o']['_id'])
                            self._action_buf.append({'_op_type': 'delete', '_index': idxname, '_type': typename, '_id': id})

                        elif op == 'c':  # command
                            dbname, _ = parse_namespace(ns)
                            idxname = self._conf.db_mapping(dbname)
                            if 'drop' in oplog['o']:
                                # TODO
                                # how to delete type?
                                pass
                                log.warn('you should implement document type deletion.')
                            if 'dropDatabase' in oplog['o']:
                                # delete index
                                self._dst.client().indices.delete(index=idxname)

                        elif op == 'n':  # no-op
                            pass
                        else:
                            log.error('invalid optype: %s' % oplog)

                        # flush
                        if self._action_buf_full():
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = oplog['ts']

                        self._last_optime = oplog['ts']
                        self._log_optime(oplog['ts'])
                        self._log_progress()
                    except StopIteration as e:
                        # flush
                        if len(self._action_buf) > 0:
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = self._last_optime
                        self._log_optime(self._last_optime)
                        self._log_progress('latest')
                        time.sleep(0.1)
                    except pymongo.errors.AutoReconnect as e:
                        log.error(e)
                        self._src.reconnect()
                        break
                    except elasticsearch.helpers.BulkIndexError as e:
                        log.error(e)
                        self._action_buf = []
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % oplog_start)
                return