コード例 #1
0
def pin_model_to_device(device, model):
    """Move ``model`` onto this process's GPU when ``device`` is ``"GPU"``.

    Args:
        device: Device selector; only the exact string ``"GPU"`` enables CUDA.
        model: Object exposing ``.cuda()``; it is only touched on the GPU path.

    Returns:
        The result of ``device == "GPU"``.
    """
    use_cuda = device == "GPU"
    if not use_cuda:
        return use_cuda

    # Bluefog: pin GPU to local rank. When Bluefog was built without NCCL the
    # local rank is wrapped around the number of visible CUDA devices.
    if bf.nccl_built():
        gpu_index = bf.local_rank()
    else:
        gpu_index = bf.local_rank() % torch.cuda.device_count()
    torch.cuda.set_device(gpu_index)
    model.cuda()
    return use_cuda
コード例 #2
0
def hier_setup():
    """Initialize Bluefog for a two-processes-per-machine hierarchical run.

    Sets ``BLUEFOG_NODES_PER_MACHINE`` to ``'2'`` before calling ``bf.init()``
    (presumably so that init picks it up -- confirm against Bluefog), then
    installs an exponential graph over ``bf.size() // 2`` machines as the
    machine topology.

    Returns:
        Tuple ``(rank, size, local_rank, local_size)`` as reported by Bluefog.
    """
    os.environ['BLUEFOG_NODES_PER_MACHINE'] = '2'
    bf.init()

    # The world must split evenly into machines of two processes each.
    assert bf.size() % 2 == 0
    num_machines = int(bf.size() // 2)
    machine_graph = bf.ExponentialGraph(num_machines)
    bf.set_machine_topology(machine_graph)

    return bf.rank(), bf.size(), bf.local_rank(), bf.local_size()
コード例 #3
0
 def cast_and_place(tensor, dtype):
     """Cast ``tensor`` to ``dtype``, moving it to a GPU first for CUDA types.

     Args:
         tensor: Tensor to convert.
         dtype: Tensor type exposing ``is_cuda``; passed to ``tensor.type()``.

     Returns:
         The converted tensor. For a CUDA ``dtype`` it is first placed on
         device ``bf.local_rank() % torch.cuda.device_count()``.

     Raises:
         EnvironmentError: With NCCL built in, when there are more local
             processes than CUDA devices.
     """
     # CPU types need no device placement.
     if not dtype.is_cuda:
         return tensor.type(dtype)

     if bf.nccl_built():
         if bf.local_size() > torch.cuda.device_count():
             raise EnvironmentError(
                 "Cannot run number of processes in one machine more than GPU device count"
                 " in NCCL environment")

     gpu_index = bf.local_rank() % torch.cuda.device_count()
     on_gpu = tensor.cuda(gpu_index)
     return on_gpu.type(dtype)
コード例 #4
0
parser.add_argument("--save-plot-file",
                    default='average_consensus_plot.png',
                    help="Saving the plot in the file.")
parser.add_argument('--seed',
                    type=int,
                    default=2020,
                    help='Seed for randomness.')

args = parser.parse_args()
# Use CUDA only when it was not disabled on the command line and is available.
args.cuda = not args.no_cuda and torch.cuda.is_available()

bf.init()

# Give every rank its own random stream derived from --seed. The base seed is
# offset by the rank rather than multiplied: with ``seed * rank`` rank 0 was
# always seeded with 0 regardless of --seed, and ``--seed 0`` handed every
# rank the same stream.
torch.random.manual_seed(args.seed + bf.rank())
if args.cuda:
    # Wrap the local rank around the number of visible CUDA devices.
    device = bf.local_rank() % torch.cuda.device_count()
    x = torch.randn(args.data_size, device=device, dtype=torch.double)
else:
    x = torch.randn(args.data_size, dtype=torch.double)

# Select the virtual topology requested on the command line.
if args.virtual_topology == "expo2":
    # Nothing to set: keep the topology already in effect after bf.init().
    pass
elif args.virtual_topology == "expo3":
    bf.set_topology(topology_util.ExponentialGraph(bf.size(), base=3))
elif args.virtual_topology == "expo4":
    bf.set_topology(topology_util.ExponentialGraph(bf.size(), base=4))
elif args.virtual_topology == "ring":
    bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=1))
elif args.virtual_topology == "mesh":
    # NOTE(review): "mesh" is mapped to a RingGraph with connect_style=0 --
    # confirm this is the intended graph for that option.
    bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=0),
                    is_weighted=True)
コード例 #5
0
parser.add_argument(
    '--batch_size', type=int, default=100,
    help="batch size (default: 100).")
parser.add_argument(
    '--seed', type=int, default=3,
    help='set seed (default: 3).')
parser.add_argument(
    '--save_name', type=str, required=True,
    help='The file_postfix to save log')

args = parser.parse_args()

# cuDNN switches, then seed the CPU-side generators of torch and numpy.
cudnn.benchmark = True
cudnn.enabled = True
torch.manual_seed(args.seed)
np.random.seed(args.seed)

bf.init()

# Pin this process to a GPU by local rank; when Bluefog was built without
# NCCL the local rank is wrapped around the number of visible CUDA devices.
if bf.nccl_built():
    device_id = bf.local_rank()
else:
    device_id = bf.local_rank() % torch.cuda.device_count()
torch.cuda.set_device(device_id)
torch.cuda.manual_seed(args.seed)

kwargs = dict(num_workers=4, pin_memory=True)

# Load this rank's train/test split and pick the matching model constructor.
if args.dataset == "MNIST":
    train_set, test_set = MNIST_dataset_flat_dist(bf.rank())
    NN_model = MNIST_two_layers
elif args.dataset == "MNIST_Conv":
    train_set, test_set = MNIST_dataset_dist(bf.rank())
    NN_model = LeNet
elif args.dataset == "CIFAR10":
    train_set, test_set = CIFAR10_dataset_dist(bf.rank())
    NN_model = vgg11
コード例 #6
0
def log(s, nl=True):
    """Print ``s`` from the process whose Bluefog local rank is 0.

    Processes with any other local rank print nothing.

    Args:
        s: Object to print.
        nl: When true, terminate the output with a newline; otherwise emit
            no terminator. The stream is flushed either way.
    """
    if bf.local_rank() == 0:
        terminator = '\n' if nl else ''
        print(s, end=terminator, flush=True)
コード例 #7
0
def test_bluefog_local_rank(hier_setup):
    """Check ``bf.local_rank()`` against the rank reported by the MPI env.

    The expected local rank is the MPI rank modulo ``min(2, size)``.

    Args:
        hier_setup: Not used in the body; presumably a pytest fixture
            requested for its setup side effects -- confirm.
    """
    mpi_rank, mpi_size = mpi_env_rank_and_size()
    reported = bf.local_rank()
    procs_per_machine = min(2, mpi_size)
    expected = mpi_rank % procs_per_machine
    assert expected == reported
コード例 #8
0
parser.add_argument('--data-size',
                    type=int,
                    default=2000,
                    help='input data size')
parser.add_argument('--data-dim',
                    type=int,
                    default=500,
                    help='input data dimension')

args = parser.parse_args()
# Use CUDA only when it was not disabled on the command line and is available.
args.cuda = not args.no_cuda and torch.cuda.is_available()

bf.init()

if args.cuda:
    # Bluefog: pin GPU to local rank. When Bluefog was built without NCCL,
    # wrap the local rank around the number of visible CUDA devices so that
    # running more local processes than GPUs does not hand set_device() an
    # out-of-range index. With NCCL the local rank is passed through as-is.
    torch.cuda.set_device(
        bf.local_rank() if bf.nccl_built()
        else bf.local_rank() % torch.cuda.device_count())
    cudnn.benchmark = True


def logistic_loss_step(x_, rho, X, y, tensor_name, calculate_by_hand=True):
    """Calculate gradient of logistic loss via pytorch autograd."""

    if calculate_by_hand:
        # prob = torch.exp( -y * X.mm(x_.data))
        prob = torch.exp(-y * torch.matmul(X, x_.data))
        alpha = prob / (1 + prob)
        x_.grad = rho * x_.data - torch.mean(alpha * y * X, dim=0).reshape(
            -1, 1)
        return

    else: