def main():
    """Parse CLI arguments, choose the random seeds, and run training.

    A master seed is drawn from ``random.SystemRandom``. In distributed runs
    (``WORLD_SIZE`` > 1) it is drawn on the main process only and broadcast
    from rank 0, so every process seeds the same ``random.Random`` generator.
    The config is merged from ``--config-file`` and the trailing ``opts``,
    frozen, and handed to ``train``. ``RUN_START``, ``RUN_SET_RANDOM_SEED``
    and ``RUN_FINAL`` events are emitted through ``print_mlperf``.
    """
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1

    def attach_compliance_log():
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = './MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        # The previous code tested the bare function object
        # (`if is_main_process:`), which is always truthy, so every rank
        # attached its own compliance log file. The check is now an actual
        # call and runs after the process group exists, at the same point
        # where the seed logic below already relies on is_main_process().
        if is_main_process():
            attach_compliance_log()

        print_mlperf(key=mlperf_log.RUN_START)

        # setting seeds - needs to be timed, so after RUN_START
        # int64 instead of float32: float32 cannot represent every integer up
        # to 2**32 - 1, so the broadcast seed used to be silently rounded.
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # Single process: it is the only writer, so always attach the log.
        attach_compliance_log()

        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    # NOTE(review): the seed list is indexed by local rank, not global rank;
    # presumably fine on a single node - confirm for multi-node runs.
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # The returned model is not used here.
    train(cfg, args.local_rank, args.distributed)

    print_mlperf(key=mlperf_log.RUN_FINAL)
def main():
    """Entry point for the herring-based training run.

    Configures the MLPerf logger and logs ``INIT_START``, parses the command
    line, agrees on a master seed across processes (drawn on the main process
    and sent with ``herring.broadcast`` when more than one GPU is in use),
    merges and freezes the config, seeds torch for this worker, initialises
    async eval via ``init()``, and calls ``train``. When ``train`` reports a
    non-``None`` success flag, a ``RUN_STOP`` event with status ``"success"``
    or ``"aborted"`` is logged.
    """
    configure_logger(constants.MASKRCNN)
    log_start(key=constants.INIT_START)

    cli = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    cli.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    cli.add_argument("--local_rank", type=int,
                     default=herring.get_local_rank())
    cli.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = cli.parse_args()

    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1
    # The value reported by herring always wins over the parsed option.
    args.local_rank = herring.get_local_rank()

    # Compliance file logging is currently disabled:
    # if is_main_process:
    #     # Setting logging file parameters for compliance logging
    #     os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
    #     constants.LOG_FILE = os.getenv("COMPLIANCE_FILE")
    #     constants._FILE_HANDLER = logging.FileHandler(constants.LOG_FILE)
    #     constants._FILE_HANDLER.setLevel(logging.DEBUG)
    #     constants.LOGGER.addHandler(constants._FILE_HANDLER)

    if not args.distributed:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
    else:
        torch.cuda.set_device(args.local_rank)

        # Only the main process draws a real seed; the others contribute a
        # placeholder that the broadcast from rank 0 overwrites.
        if is_main_process():
            seed_value = random.SystemRandom().randint(0, 2**32 - 1)
        else:
            seed_value = 0
        seed_tensor = torch.tensor(seed_value,
                                   dtype=torch.float32,
                                   device=torch.device("cuda"))

        herring.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())

    # Record the seed on args and build the shared generator from it.
    args.seed = master_seed
    random_number_generator = random.Random(master_seed)
    log_event(key=constants.SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # One seed per distributed worker, then synced from rank 0.
    # todo sharath what if CPU
    worker_seeds = generate_seeds(random_number_generator,
                                  herring.get_world_size())
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    local_seed = worker_seeds[args.local_rank]
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, local_seed))
    torch.manual_seed(local_seed)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as config_file:
        config_text = config_file.read()
        logger.info("\n" + config_text)
    logger.info("Running with config:\n{}".format(cfg))

    # Initialise async eval
    init()

    _model, success = train(cfg, args.local_rank, args.distributed,
                            random_number_generator)

    # A None flag means there is no run status to report.
    if success is None:
        return
    status = "success" if success else "aborted"
    log_end(key=constants.RUN_STOP, metadata={"status": status})
def main():
    """Parse CLI arguments, choose the random seeds, and run training.

    In distributed runs (``WORLD_SIZE`` > 1) the NCCL process group is
    initialised, rank 0 attaches the compliance log file and draws the master
    seed, and the seed is broadcast to all ranks. Otherwise the single
    process does both itself. The config is merged from ``--config-file`` and
    the trailing ``opts`` (``--skip-test`` turns ``DO_ONLINE_MAP_EVAL`` off),
    frozen, and passed to ``train`` together with the parsed arguments.

    Note that ``args.seed`` is always overwritten with the drawn master seed,
    so the value given through ``--seed`` is not used by this function.
    """
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))
    # mlperf_log.LOGGER.propagate = False

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default=
        "/private/home/fmassa/github/detectron.pytorch/configs/rpn_r50.py",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--skip-test',
                        dest='skip_test',
                        help='Do not test the model',
                        action='store_true')
    parser.add_argument("--fp16",
                        action="store_true",
                        help="Enable multi-precision training")
    parser.add_argument("--min_bbox_map",
                        type=float,
                        default=0.377,
                        help="Target BBOX MAP")
    parser.add_argument("--min_mask_map",
                        type=float,
                        default=0.339,
                        help="Target SEGM/MASK MAP")
    parser.add_argument("--seed", type=int, default=1, help="Seed")
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    args.distributed = (int(os.environ["WORLD_SIZE"]) > 1
                        if "WORLD_SIZE" in os.environ else False)

    def attach_compliance_log():
        # Setting logging file parameters for compliance logging.
        # Shared by both branches below, which previously each carried an
        # identical copy of these statements.
        # NOTE(review): the path is rooted at "/" - confirm that is intended.
        os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

        # to synchronize start of time
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

        is_rank_zero = torch.distributed.get_rank() == 0
        if is_rank_zero:
            attach_compliance_log()

        print_mlperf(key=mlperf_log.RUN_START)

        # Setting seed
        # int64 instead of float32: float32 cannot represent every integer up
        # to 2**32 - 1, so the broadcast seed used to be silently rounded.
        if is_rank_zero:
            # seed = int(time.time())
            # random master seed, random.SystemRandom() uses /dev/urandom on Unix
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))
        else:
            # Placeholder that the broadcast from rank 0 overwrites.
            seed_tensor = torch.tensor(0,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))
        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        attach_compliance_log()

        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    if args.skip_test:
        cfg.DO_ONLINE_MAP_EVAL = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    #logger = setup_logger("maskrcnn_benchmark", output_dir, args.local_rank)
    logger = setup_logger("maskrcnn_benchmark", None, args.local_rank)
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # The returned model is not used here.
    train(cfg, random_number_generator, args.local_rank, args.distributed,
          args, args.fp16)

    print_mlperf(key=mlperf_log.RUN_FINAL)
def main():
    """Parse CLI arguments, choose the random seeds, and run training.

    Configures the MLPerf logger and logs ``INIT_START``. A master seed is
    drawn from ``random.SystemRandom``; in distributed runs
    (``WORLD_SIZE`` > 1) it is drawn on the main process and broadcast from
    rank 0. The config is merged from ``--config-file`` and the trailing
    ``opts``, frozen, and passed to ``train``. When ``train`` reports a
    non-``None`` success flag, a ``RUN_STOP`` event with status ``"success"``
    or ``"aborted"`` is logged.
    """
    configure_logger(constants.MASKRCNN)
    log_start(key=constants.INIT_START)

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    # train() below is given args.disable_allreduce_for_logging, but the
    # option was never declared, so the call raised AttributeError. Declaring
    # it as an opt-in flag gives the attribute a default of False.
    parser.add_argument(
        "--disable_allreduce_for_logging",
        action="store_true",
        help="Forwarded to train() as disable_allreduce_for_logging",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        # setting seeds - needs to be timed, so after RUN_START
        # int64 instead of float32: float32 cannot represent every integer up
        # to 2**32 - 1, so the broadcast seed used to be silently rounded.
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))
        else:
            # Placeholder that the broadcast from rank 0 overwrites.
            seed_tensor = torch.tensor(0,
                                       dtype=torch.int64,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    log_event(key=constants.SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Only the success flag is used; the returned model is discarded.
    _, success = train(cfg, args.local_rank, args.distributed,
                       args.disable_allreduce_for_logging,
                       random_number_generator)

    if success is not None:
        if success:
            log_end(key=constants.RUN_STOP, metadata={"status": "success"})
        else:
            log_end(key=constants.RUN_STOP, metadata={"status": "aborted"})