def update_ttls_parent(sources, state_db, args):
    process_names = {repr(source): "%s:%d" % (source['host'], source['port'])
                     for source in sources}

    processes = []
    for source in sources:
        name = process_names[repr(source)]
        process = multiprocessing.Process(target=update_ttls,
                                          name=name,
                                          kwargs=dict(
                                              source=source,
                                              state_path=state_db._path,
                                              seconds=args.seconds))
        process.start()
        processes.append(process)

    utils.wait_for_processes(processes)
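
# utils.wait_for_processes() is used by every parent function in this module
# but defined elsewhere in the repo. A minimal sketch of what such a helper
# might look like, assuming it simply joins each worker and aborts if any of
# them exited uncleanly (the real implementation may differ):

def wait_for_processes_sketch(processes):
    failed = []
    for process in processes:
        process.join()  # blocks until this worker exits
        if process.exitcode != 0:
            failed.append(process.name)
    if failed:
        die("worker(s) %s exited with errors" % ", ".join(failed))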
def copy_collection_parent(sources, dest, state_db, args):
    """
    drive the collection copying process by delegating work to a pool of
    worker processes
    """
    # ensure state db has rows for each source/dest pair
    for source in sources:
        state_db.add_source_and_dest(source, dest)

    # space-pad all process names so that tabular output formats line up
    process_names = {repr(source): "%s:%d" % (source['host'], source['port'])
                     for source in sources}
    process_names['parent'] = PARENT_PROCESS_NAME
    max_process_name_len = max(len(name) for name in process_names.itervalues())
    for key in process_names:
        process_names[key] = string.ljust(process_names[key], max_process_name_len)

    multiprocessing.current_process().name = process_names['parent']

    # -----------------------------------------------------------------------
    # perform initial copy, if it hasn't been done yet
    # -----------------------------------------------------------------------
    in_initial_copy = len(state_db.select_by_state(CopyStateDB.STATE_INITIAL_COPY))
    if in_initial_copy and in_initial_copy < len(sources):
        die("prior attempt at initial copy failed; rerun with --restart")
    if in_initial_copy > 0:
        ensure_empty_dest(dest)

        # each worker process copies one shard
        processes = []
        for source in sources:
            name = process_names[repr(source)]
            process = multiprocessing.Process(target=copier.copy_collection,
                                              name=name,
                                              kwargs=dict(
                                                  source=source,
                                                  dest=dest,
                                                  state_path=state_db._path,
                                                  percent=args.percent))
            process.start()
            processes.append(process)

        # wait for all workers to finish
        utils.wait_for_processes(processes)

    # -----------------------------------------------------------------------
    # build indices on main process, since that only needs to be done once
    # -----------------------------------------------------------------------
    waiting_for_indices = len(state_db.select_by_state(CopyStateDB.STATE_WAITING_FOR_INDICES))
    if waiting_for_indices and waiting_for_indices < len(sources):
        die("not all initial copies have been completed; rerun with --restart")
    if waiting_for_indices > 0:
        log.info("building indices")
        copier.copy_indexes(sources[0], dest)
        for source in sources:
            state_db.update_state(source, dest, CopyStateDB.STATE_APPLYING_OPLOG)

    # -----------------------------------------------------------------------
    # apply oplogs
    # -----------------------------------------------------------------------
    applying_oplog = state_db.select_by_state(CopyStateDB.STATE_APPLYING_OPLOG)
    if len(applying_oplog) < len(sources):
        die("this shouldn't happen!")

    log.info("starting oplog apply")

    # create a worker process that prints headers for oplog stats on a regular
    # basis; we do this to prevent the visual clutter caused by multiple
    # processes doing this
    #
    # we avoid using gevent in the parent process to avoid weirdness I've seen
    # with fork()ed gevent loops
    header_delay = max(float(20) / len(sources), 10)
    stats_name = string.ljust("stats", max_process_name_len)
    stats_proc = multiprocessing.Process(target=oplog_applier.print_header_worker,
                                         args=(header_delay,),
                                         name=stats_name)
    stats_proc.start()

    # need to isolate calls to gevent here, to avoid forking with
    # monkey-patched modules (which seems to create funkiness)
    processes = []
    for source in sources:
        name = process_names[repr(source)]
        process = multiprocessing.Process(target=oplog_applier.apply_oplog,
                                          name=name,
                                          kwargs=dict(
                                              source=source,
                                              dest=dest,
                                              percent=args.percent,
                                              state_path=state_db._path))
        process.start()
        processes.append(process)

    # this should *never* finish
    processes.append(stats_proc)
    utils.wait_for_processes(processes)
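
# ensure_empty_dest() is called before the initial copy but defined elsewhere;
# a minimal sketch of the idea, assuming pymongo and a dest dict shaped like
# {'host', 'port', 'db', 'collection'} (the key names beyond 'host'/'port' are
# assumptions, not the repo's actual schema):

import pymongo

def ensure_empty_dest_sketch(dest):
    client = pymongo.MongoClient(dest['host'], dest['port'])
    if client[dest['db']][dest['collection']].count() > 0:
        die("destination collection is not empty; drop it or rerun with --restart")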
def copy_collection_parent(manifests, state_db, args):
    """
    drive the collection copying process by delegating work to a pool of
    worker processes
    """
    # ensure state db has a row for each manifest
    for manifest in manifests:
        state_db.add_manifest(manifest)

    # space-pad all process names so that tabular output formats line up
    process_names = dict(
        (repr(manifest),
         "%s:%s.%s->%s:%s.%s" % (manifest["srchost"], manifest["srcdb"],
                                 manifest["srccol"], manifest["desthost"],
                                 manifest["destdb"], manifest["destcol"]))
        for manifest in manifests)
    process_names['parent'] = PARENT_PROCESS_NAME
    max_process_name_len = max(len(name) for name in process_names.itervalues())
    for key in process_names:
        process_names[key] = string.ljust(process_names[key], max_process_name_len)
    #multiprocessing.current_process().name = process_names['parent']

    # -----------------------------------------------------------------------
    # build indices on main process, since that only needs to be done once
    # -----------------------------------------------------------------------
    waiting_for_indices = len(state_db.select_by_state(CopyStateDB.STATE_WAITING_FOR_INDICES))
    if waiting_for_indices and waiting_for_indices < len(manifests):
        log.warn("prior attempt may have failed; you can rerun from scratch with --restart")
    if waiting_for_indices > 0:
        log.info("building indices")
        copier.copy_indexes(manifests, args.drop)
        for manifest in manifests:
            state_db.update_state(manifest, CopyStateDB.STATE_INITIAL_COPY)

    # -----------------------------------------------------------------------
    # perform initial copy, if it hasn't been done yet
    # -----------------------------------------------------------------------
    in_initial_copy = len(state_db.select_by_state(CopyStateDB.STATE_INITIAL_COPY))
    if in_initial_copy and in_initial_copy < len(manifests):
        log.warn("prior attempt may have failed; you can rerun from scratch with --restart")
    if in_initial_copy > 0:
        # each worker process copies one shard
        processes = []
        for manifest in manifests:
            name = process_names[repr(manifest)]
            process = multiprocessing.Process(target=copier.copy_collection,
                                              name=name,
                                              kwargs=dict(
                                                  manifest=manifest,
                                                  state_path=state_db._path,
                                                  percent=args.percent))
            process.start()
            processes.append(process)

        # wait for all workers to finish
        utils.wait_for_processes(processes)
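
# The name formatting above implies each manifest carries at least the six
# keys shown below; a hand-built example (the values are illustrative, and
# the surrounding args/state_db wiring is assumed):

example_manifest = {
    'srchost': 'shard0.example.com',
    'srcdb': 'prod',
    'srccol': 'users',
    'desthost': 'dest.example.com',
    'destdb': 'prod',
    'destcol': 'users_copy',
}
# copy_collection_parent([example_manifest], state_db, args)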
args = parser.parse_args()

dest = utils.parse_mongo_url(args.dest)
if os.path.exists(args.source):
    sources = utils.parse_source_file(args.source)
else:
    sources = [utils.parse_mongo_url(args.source)]

if args.ids_file and args.recent_ops:
    raise ValueError("the --ids-file and --recent-ops parameters cannot be combined")

# finally, compare stuff!
processes = []
for source in sources:
    name = "%s:%s" % (source['host'], source['port'])
    process = Process(target=compare_collections,
                      name=name,
                      kwargs=dict(
                          source=source,
                          dest=dest,
                          percent=args.percent,
                          error_bp=args.error_bp,
                          recent_ops=args.recent_ops,
                          ids_file=args.ids_file,
                      ))
    process.start()
    # track each worker, so wait_for_processes() actually has something to
    # wait on (the append was missing above)
    processes.append(process)

utils.wait_for_processes(processes)
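
# utils.parse_mongo_url() is defined elsewhere; the code above only relies on
# the returned dict exposing 'host' and 'port'. A minimal sketch, assuming a
# "host:port/db/collection"-style URL (the repo's real URL format, and the
# 'db'/'collection' keys, are assumptions):

def parse_mongo_url_sketch(url):
    host_port, db, collection = url.split('/', 2)
    host, _, port = host_port.partition(':')
    return dict(host=host,
                port=int(port) if port else 27017,
                db=db,
                collection=collection)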