Example #1
0
    def _setup_misc(self):
        """
        Miscellaneous setup steps (previously part of gossip_sgd).

        Builds the initial training-state dict, creates the cluster
        manager, creates the timing meters, writes the log-file header if
        the log file does not exist yet, and records the starting
        iteration / epoch / elapsed-time counters on `self`.
        """
        cfg = self.config

        # initial values of the training state; they are registered on the
        # (initially empty) state dict through `update_state`
        initial_values = {
            'epoch': 0,
            'itr': 0,
            'best_prec1': 0,
            'is_best': True,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'elapsed_time': 0,
            'batch_meter': Meter(ptag='Time').__dict__,
            'data_meter': Meter(ptag='Data').__dict__,
            'nn_meter': Meter(ptag='Forward/Backward').__dict__
        }
        train_state = {}
        update_state(train_state, initial_values)
        self.state = train_state

        # module used to relaunch jobs and handle external termination
        # signals; it is handed the very same state dict stored on `self`
        ClusterManager.set_checkpoint_dir(cfg['checkpoint_dir'])
        self.cmanager = ClusterManager(
            rank=cfg['rank'],
            world_size=cfg['world_size'],
            model_tag=cfg['tag'],
            state=train_state,
            all_workers=cfg['checkpoint_all'])

        # enable low-level optimization of compute graph using cuDNN library?
        cudnn.benchmark = True

        # the meters are built from the state entries only after the cluster
        # manager has been constructed with that state
        for meter_name in ('batch_meter', 'data_meter', 'nn_meter'):
            setattr(self, meter_name, Meter(train_state[meter_name]))

        # initialize log file (an existing file is left untouched)
        log_path = cfg['out_fname']
        if not os.path.exists(log_path):
            header_template = (
                'BEGIN-TRAINING\n'
                'World-Size,{ws}\n'
                'Num-DLWorkers,{nw}\n'
                'Batch-Size,{bs}\n'
                'Epoch,itr,BT(s),avg:BT(s),std:BT(s),'
                'NT(s),avg:NT(s),std:NT(s),'
                'DT(s),avg:DT(s),std:DT(s),'
                'Loss,avg:Loss,Prec@1,avg:Prec@1,Prec@5,avg:Prec@5,val')
            with open(log_path, 'w') as log_file:
                header = header_template.format(
                    ws=cfg['world_size'],
                    nw=cfg['num_dataloader_workers'],
                    bs=cfg['batch_size'])
                print(header, file=log_file)

        # starting counters are read back from the state dict
        self.start_itr = train_state['itr']
        self.start_epoch = train_state['epoch']
        self.elapsed_time = train_state['elapsed_time']
        # shift the start time back by the time already elapsed
        self.begin_time = time.time() - train_state['elapsed_time']
        self.best_val_prec1 = 0
Example #2
0
def parse_args():
    """
    Build the global args object, set env-vars and initialize the torch
    distributed backend.

    In this variant the option values are hard-coded defaults held in a
    local `DataStore` object (the argparse call is disabled):
        rank <-- $OMPI_COMM_WORLD_RANK for the 'mpi' backend, otherwise 1
        world_size <-- $OMPI_UNIVERSE_SIZE for the 'mpi' backend, otherwise 5
        Master address <-- $HOSTNAME
        Master port <-- the `master_port` default

    Returns the populated args object, including the derived `lr_schedule`,
    `ppi_schedule`, `graph` and `mixing` attributes.
    """
    class DataStore():
        def __init__(self):
            # boolean options are kept as 'True'/'False' strings here and
            # converted to real booleans further down in parse_args

            self.all_reduce = 'False'
            self.batch_size = 32
            self.lr = 0.1
            self.num_dataloader_workers = 10
            self.num_epochs = 90
            self.num_iterations_per_training_epoch = None
            self.momentum = 0.9
            self.weight_decay = 1e-4
            self.push_sum = 'True'
            self.graph_type = 5
            self.mixing_strategy = 0
            self.schedule = None
            self.peers_per_itr_schedule = None
            self.overlap = 'False'
            self.synch_freq = 0
            self.warmup = 'False'
            self.seed = 47
            self.print_freq = 10
            self.checkpoint_all = 'False'
            self.overwrite_checkpoints = 'True'
            self.master_port = '40100'
            self.checkpoint_dir = "./"
            self.network_interface_type = 'infiniband'
            self.num_itr_ignore = 10
            # self.dataset_dir = "./data/"
            self.no_cuda_streams = None

            self.master_addr = None
            self.backend = 'nccl'

            self.rank = 1
            self.world_size = 5
            self.tag = ''
            self.out_fname = ''
            self.resume = 'False'
            self.verbose = 'True'
            self.train_fast = 'False'
            self.nesterov = 'False'

    args = DataStore()  # stands in for parser.parse_args()
    ClusterManager.set_checkpoint_dir(args.checkpoint_dir)

    # rank and world_size need to be changed depending on the scheduler being
    # used to run the distributed jobs
    args.master_addr = os.environ['HOSTNAME']
    if args.backend == 'mpi':
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_UNIVERSE_SIZE'])
    else:
        # fixed values; the SLURM lookups ($SLURM_PROCID / $SLURM_NTASKS)
        # are disabled in this variant
        args.rank = 1
        args.world_size = 5

    # per-rank csv log file: <checkpoint dir><tag>out_r<rank>_n<world>.csv
    args.out_fname = ''.join((
        ClusterManager.CHECKPOINT_DIR,
        args.tag,
        'out_r', str(args.rank),
        '_n', str(args.world_size),
        '.csv',
    ))

    # turn the 'True'/'False' strings into booleans (anything other than
    # the exact string 'True' becomes False)
    for flag in ('resume', 'verbose', 'train_fast', 'nesterov',
                 'checkpoint_all', 'warmup', 'overlap', 'push_sum',
                 'all_reduce', 'overwrite_checkpoints'):
        setattr(args, flag, getattr(args, flag) == 'True')

    # communicate on cpu only for gloo without push-sum and all-reduce
    args.cpu_comm = (args.backend == 'gloo' and not args.push_sum
                     and not args.all_reduce)
    args.comm_device = torch.device('cpu' if args.cpu_comm else 'cuda')

    # parse lr sched: flat [epoch, value, epoch, value, ...] list -> dict
    # (a trailing epoch without a value is dropped)
    lr_pairs = args.schedule
    if lr_pairs is None:
        lr_pairs = [30, 0.1, 60, 0.1, 80, 0.1]
    lr_items = iter(lr_pairs)
    args.lr_schedule = dict(zip(lr_items, lr_items))
    del args.schedule

    # parse peers per itr sched (epoch, num_peers)
    ppi_pairs = args.peers_per_itr_schedule
    if ppi_pairs is None:
        ppi_pairs = [0, 1]
    ppi_items = iter(ppi_pairs)
    args.ppi_schedule = dict(zip(ppi_items, ppi_items))
    del args.peers_per_itr_schedule
    # must specify how many peers to communicate from the start of training
    assert 0 in args.ppi_schedule

    if args.all_reduce:
        assert args.graph_type == -1

    # pick the network interface used by the communication backend
    iface_type = args.network_interface_type
    if args.backend == 'gloo':
        assert iface_type == 'ethernet'
        os.environ['GLOO_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=iface_type)
    elif iface_type == 'ethernet':
        if args.backend != 'nccl':
            raise NotImplementedError
        os.environ['NCCL_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=iface_type)
        os.environ['NCCL_IB_DISABLE'] = '1'

    # initialize torch distributed backend
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    dist.init_process_group(backend=args.backend,
                            world_size=args.world_size,
                            rank=args.rank)

    args.graph, args.mixing = None, None
    graph_class = GRAPH_TOPOLOGIES[args.graph_type]
    if graph_class:
        # the barrier makes sure the NCCL communicator is created right
        # here; this prevents an error which may be caused if the NCCL
        # communicator is created at a time gap of more than 5 minutes in
        # different processes
        dist.barrier()
        args.graph = graph_class(args.rank,
                                 args.world_size,
                                 peers_per_itr=args.ppi_schedule[0])

    # a mixing strategy is only built when a graph was built
    mixing_class = MIXING_STRATEGIES[args.mixing_strategy]
    if mixing_class and args.graph:
        args.mixing = mixing_class(args.graph, args.comm_device)

    return args
Example #3
0
def parse_args():
    """
    Parse command-line args, set env-vars and global args, and initialize
    the torch distributed backend.

        rank <-- $OMPI_COMM_WORLD_RANK ('mpi' backend) or $SLURM_PROCID
        world_size <-- $OMPI_UNIVERSE_SIZE ('mpi' backend) or $SLURM_NTASKS
        device_id <-- $OMPI_COMM_WORLD_LOCAL_RANK ('mpi' backend)
                      or $SLURM_LOCALID
        Master address <-- $HOSTNAME
        Master port <-- args.master_port + 1

    Returns:
        The parsed args namespace, extended with derived attributes
        (`out_fname`, `cpu_comm`, `comm_device`, `lr_schedule`,
        `ppi_schedule`, `graph_class`, `mixing_class`, ...).

    Raises:
        KeyError: a required environment variable is not set.
        AssertionError: the peers-per-itr schedule has no entry for epoch
            0, or the 'gloo' backend is used with a non-ethernet interface.
        NotImplementedError: an ethernet interface is requested for a
            backend other than 'gloo' or 'nccl'.
        Exception: `graph_type` or `mixing_strategy` maps to None.
    """
    # function-scope import: only needed for the polling loop below
    import time

    args = parser.parse_args()
    ClusterManager.set_checkpoint_dir(args.checkpoint_dir)

    args.master_addr = os.environ['HOSTNAME']
    if args.backend == 'mpi':
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_UNIVERSE_SIZE'])
        args.device_id = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    else:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.world_size = int(os.environ['SLURM_NTASKS'])
        args.device_id = int(os.environ['SLURM_LOCALID'])

    # per-rank csv log file: <checkpoint dir><tag>out_r<rank>_n<world>.csv
    args.out_fname = ClusterManager.CHECKPOINT_DIR \
        + args.tag \
        + 'out_r' + str(args.rank) \
        + '_n' + str(args.world_size) \
        + '.csv'

    # boolean options arrive as strings; only the exact string 'True'
    # enables an option
    for flag in ('resume', 'verbose', 'train_fast', 'nesterov',
                 'checkpoint_all', 'warmup', 'overlap', 'push_sum',
                 'all_reduce', 'bilat'):
        setattr(args, flag, getattr(args, flag) == 'True')

    args.cpu_comm = args.backend == 'gloo'
    args.comm_device = torch.device('cpu' if args.cpu_comm else 'cuda')
    args.global_epoch = None
    args.global_itr = None

    # rank 0 removes the shared file if it exists; every rank then waits
    # until the file is gone. Sleep between checks so waiting ranks do not
    # spin a CPU core at 100%.
    if args.rank == 0 and os.path.isfile(args.shared_fpath):
        os.remove(args.shared_fpath)
    while os.path.isfile(args.shared_fpath):
        time.sleep(0.1)

    # parse lr sched: flat [epoch, value, epoch, value, ...] list -> dict
    # (zip over one shared iterator pairs consecutive items; a trailing
    # epoch without a value is dropped)
    if args.schedule is None:
        args.schedule = [30, 0.1, 60, 0.1, 80, 0.1]
    lr_items = iter(args.schedule)
    args.lr_schedule = dict(zip(lr_items, lr_items))
    del args.schedule

    # parse peers per itr sched (epoch, num_peers)
    if args.peers_per_itr_schedule is None:
        args.peers_per_itr_schedule = [0, 1]
    ppi_items = iter(args.peers_per_itr_schedule)
    args.ppi_schedule = dict(zip(ppi_items, ppi_items))
    del args.peers_per_itr_schedule
    # must specify how many peers to communicate from the start of training
    assert 0 in args.ppi_schedule, \
        'peers_per_itr_schedule must contain an entry for epoch 0'

    # pick the network interface used by the communication backend
    if args.backend == 'gloo':
        assert args.network_interface_type == 'ethernet', \
            "the 'gloo' backend requires network_interface_type 'ethernet'"
        os.environ['GLOO_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=args.network_interface_type)
    elif args.network_interface_type == 'ethernet':
        if args.backend != 'nccl':
            raise NotImplementedError
        os.environ['NCCL_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=args.network_interface_type)
        os.environ['NCCL_IB_DISABLE'] = '1'

    # initialize torch distributed backend
    os.environ['MASTER_ADDR'] = args.master_addr
    # note: the port actually used is one above the configured master_port
    os.environ['MASTER_PORT'] = str(int(args.master_port) + 1)
    dist.init_process_group(backend=args.backend,
                            world_size=args.world_size,
                            rank=args.rank)

    # only the classes are resolved here; they are not instantiated
    args.graph_class = GRAPH_TOPOLOGIES[args.graph_type]
    args.mixing_class = MIXING_STRATEGIES[args.mixing_strategy]

    if args.graph_class is None:
        raise Exception('Incorrect arguments for graph_type')
    if args.mixing_class is None:
        raise Exception('Incorrect arguments for mixing_strategy')

    return args