def run(rank, size, epochs, batch_size):
    torch.manual_seed(0)
    numpy.random.seed(0)
    batch_size = int(batch_size / float(dist.get_world_size()))

    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set, num_replicas=size, rank=rank)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=2,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               shuffle=False,
                                               pin_memory=True)

    print('Number of training batches per rank is {}'.format(len(train_loader)))

    test_set = datasets.CIFAR10(root="./data",
                                train=False,
                                download=True,
                                transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    print('Number of test batches is {}'.format(len(test_loader)))

    training_criterion = torch.nn.CrossEntropyLoss().to(device)

    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)
    # run training for the requested number of epochs
    for epoch in range(epochs):
        start_time = time.time()
        train_model(model, train_loader, optimizer, training_criterion, rank)
        print('Training time for epoch {} is {}'.format(
            epoch + 1, (time.time() - start_time)))
        test_model(model, test_loader, training_criterion)
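

# The examples in this file call train_model() and test_model() without defining
# them. The sketch below is an assumed reference implementation for this CPU
# setup, not the original helpers; the exact signatures vary between examples
# (some also pass an epoch index or the world size).
def train_model(model, train_loader, optimizer, criterion, rank):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if batch_idx % 20 == 19:  # report every 20 mini-batches
            print('rank {} [{:5d}] loss: {:.3f}'.format(
                rank, batch_idx + 1, running_loss / 20))
            running_loss = 0.0


def test_model(model, test_loader, criterion):
    model.eval()
    test_loss, correct = 0.0, 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += criterion(output, target).item()
            correct += output.argmax(dim=1).eq(target).sum().item()
    print('Test loss: {:.4f}, accuracy: {:.2f}%'.format(
        test_loss / len(test_loader), 100.0 * correct / len(test_loader.dataset)))
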
def vgg_model(rank, size):
    torch.manual_seed(5000)
    np.random.seed(5000)
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="../../data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)

    # shard the training set across ranks with a DistributedSampler once the
    # process group has been initialized
    use_dist = torch.distributed.is_available() and torch.distributed.is_initialized()
    sampler_d = DistributedSampler(training_set) if use_dist else None
    train_loader = torch.utils.data.DataLoader(
        training_set,
        num_workers=2,
        batch_size=batch_size,
        sampler=sampler_d,
        #shuffle=True,
        pin_memory=True)
    test_set = datasets.CIFAR10(root="../../data",
                                train=False,
                                download=True,
                                transform=transform_test)

    # the test set uses a regular (non-distributed) loader
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)

    model = mdl.VGG11()
    model.to(device)
    ddp_model = DDP(model)
    optimizer = optim.SGD(ddp_model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)
    # running training for one epoch
    for epoch in range(1):
        train_model(ddp_model, train_loader, optimizer, training_criterion,
                    epoch, rank)
        test_model(ddp_model, test_loader, training_criterion)
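

# vgg_model() wraps the network in DistributedDataParallel, which assumes the
# default process group has already been initialized for this rank. A possible
# single-machine launcher for it (an assumption, not part of the original
# snippet) using the Gloo backend and torch.multiprocessing.spawn; os, torch
# and torch.distributed (as dist) are assumed to be imported at module level.
def init_process(rank, size, fn, backend='gloo'):
    # MASTER_ADDR/MASTER_PORT values here are placeholders for a local run
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == '__main__':
    import torch.multiprocessing as mp
    world_size = 2  # assumed number of local worker processes
    mp.spawn(init_process, args=(world_size, vgg_model), nprocs=world_size, join=True)
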
def main():
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="../../data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=2,
                                               batch_size=batch_size,
                                               sampler=None,
                                               shuffle=True,
                                               pin_memory=True)
    test_set = datasets.CIFAR10(root="../../data",
                                train=False,
                                download=True,
                                transform=transform_test)

    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)

    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)
    # running training for one epoch
    for epoch in range(1):
        train_model(model, train_loader, optimizer, training_criterion, epoch)
        test_model(model, test_loader, training_criterion)


def run(rank, size, outputfile):
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    """set randomseed"""
    torch.manual_seed(randomseed)
    print('manual_seed=', randomseed)
    """set up data"""
    train_set, bsz = partition_dataset(normalize)

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])

    test_set = datasets.CIFAR10(root="./data",
                                train=False,
                                download=True,
                                transform=transform_test)
    # per-rank batch size, recomputed here for the test loader
    bsz = int(batch_size / float(size))
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=bsz,
                                              shuffle=False,
                                              pin_memory=True)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    """set up model"""
    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)

    num_batches = math.ceil(len(train_set.dataset) / float(bsz))
    """write output to file"""
    if os.path.exists(outputfile):
        os.remove(outputfile)
    fp = open(
        outputfile + "_r" + str(dist.get_rank()) + "_size" +
        str(dist.get_world_size()), "a")
    """start training"""
    total_epoch = 1
    for epoch in range(total_epoch):
        # # training start from here
        running_loss = 0.0
        # remember to exit the train loop at end of the epoch
        for batch_idx, (data, target) in enumerate(train_set):
            if batch_idx < 10:
                start = timeit.default_timer()

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(data)
            loss = criterion(outputs, target)
            running_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
            if batch_idx % 20 == 19:  # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, batch_idx + 1, running_loss / 20))
                fp.write('[%d, %5d] loss: %.3f\n' %
                         (epoch + 1, batch_idx + 1, running_loss / 20))
                running_loss = 0.0

            if batch_idx == 0:
                fp.write("Batch\trunning time\n")
            if batch_idx < 10:
                end = timeit.default_timer() - start
                print("Batch " + str(batch_idx) + " running time:" + str(end))
                fp.write('%d\t%.5f\n' % (batch_idx, end))

        # training for this epoch is complete

    # close the per-rank log once all epochs are done
    fp.close()

    test_model(model, test_loader, criterion, outputfile)
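

# run() above depends on two helpers that are not shown in this collection:
# partition_dataset() and average_gradients(). The versions below are assumptions
# about what they do. average_gradients() follows the well-known pattern from the
# PyTorch distributed tutorial; partition_dataset() is sketched here with a
# DistributedSampler and relies on the module-level batch_size and the
# torchvision imports used elsewhere in this file.
def partition_dataset(normalize):
    # build the CIFAR-10 training loader with a per-rank shard and batch size
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    training_set = datasets.CIFAR10(root="./data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)
    bsz = int(batch_size / float(dist.get_world_size()))
    sampler = torch.utils.data.distributed.DistributedSampler(training_set)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=2,
                                               batch_size=bsz,
                                               sampler=sampler,
                                               pin_memory=True)
    return train_loader, bsz


def average_gradients(model):
    # all-reduce every gradient and divide by the world size so that each rank
    # applies the same averaged update
    size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= size
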
def main():

    #pull in parameters and set variables needed for initialization
    args = parse_args()
    print('main.py launched with master ip: ' + str(args.master_ip) +
          ' num nodes: ' + str(args.num_nodes) + ' rank: ' + str(args.rank) +
          ' port: ' + str(args.port))

    world_size = int(args.num_nodes)
    os.environ["GLOO_SOCKET_IFNAME"] = 'eth1'

    # the global batch size is 256 across all nodes, so divide by the number of
    # nodes; integer division keeps it a valid DataLoader batch size
    batch_size = 256 // world_size

    random_seed = 314
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    #initialize distributed group
    td.init_process_group('gloo',
                          init_method='tcp://' + args.master_ip + ':' +
                          args.port,
                          world_size=world_size,
                          rank=int(args.rank))

    #start model setup
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)
    # added for the distributed setup: shard the training set across nodes
    dist_train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set)

    # pass the sampler to the train_loader (shuffle must stay off when a
    # sampler is supplied)
    train_loader = torch.utils.data.DataLoader(
        training_set,
        num_workers=2,
        batch_size=batch_size,
        sampler=dist_train_sampler,
        #shuffle=True,
        pin_memory=True)
    test_set = datasets.CIFAR10(root="./data",
                                train=False,
                                download=True,
                                transform=transform_test)

    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)

    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)

    #save off model to check if model is initialized the same across nodes
    #torch.save(model.state_dict(), './node'+args.rank+'_'+'initial_model_weights')

    # running training for one epoch
    for epoch in range(1):
        train_model(model, train_loader, optimizer, training_criterion, epoch,
                    world_size)
        test_model(model, test_loader, training_criterion)
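

# parse_args() is used above but not shown. A plausible definition based on the
# attributes main() reads (master_ip, num_nodes, rank, port); the flag names,
# defaults and help strings are assumptions, and argparse is assumed to be
# imported at module level.
def parse_args():
    parser = argparse.ArgumentParser(description='Distributed VGG11 training on CIFAR-10')
    parser.add_argument('--master-ip', dest='master_ip', type=str, required=True,
                        help='IP address of the rank-0 node')
    parser.add_argument('--num-nodes', dest='num_nodes', type=int, required=True,
                        help='total number of participating nodes')
    parser.add_argument('--rank', type=str, required=True,
                        help='rank of this node (0..num_nodes-1)')
    parser.add_argument('--port', type=str, default='29500',
                        help='port the rank-0 node listens on')
    return parser.parse_args()
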
train_imageData = mnist_data.train.images.reshape(
    mnist_data.train.images.shape[0], 28, 28)

print("start expand data")
train_imageData = utils.BatchImageExpand(train_imageData[:SampleSize], 224)
print("expand data done")
train_labels = mnist_data.train.labels[:SampleSize]
train_imageData = train_imageData.reshape(-1, 224, 224, 1)

dataset = utils.DataSet(train_imageData,
                        train_labels,
                        testRate=0.05,
                        batchSize=batchSize)
learnCurve = utils.learnCurve()
vgg = model.VGG11()

#data = dataset.getNextBatch()

with tf.Session() as sess:
    writer = tf.summary.FileWriter("name_scope", sess.graph)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    for ep in range(1, 11):
        data = dataset.getNextBatch()
        while data is not None:
            # for i, d in enumerate(data[0]):
            #     utils.plotDigit(d.reshape(224,224))
            #     print(np.argmax(data[1][i]))
            # print(data[1])


def main():
    args = parser.parse_args()
    # Sets a certain seed for generating random numbers
    seed = 314
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # per-node batch: the global batch size of 256 is split evenly across nodes;
    # integer division keeps it a valid DataLoader batch size
    batch_size = 256 // args.num_nodes

    # distributed model setup
    setup(args)
    model = mdl.VGG11()
    print("Setup Finished")

    # Distributed Data Parallel training using the built-in module on CPU.
    # DDP uses collective communications from the torch.distributed package to
    # synchronize gradients and buffers transparently: gradient synchronization
    # takes place during the backward pass and overlaps with the backward computation.
    ddp_model = DDP(model)
    ddp_model.to(device)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(ddp_model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)

    # data loading and preprocess
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data",
                                    train=True,
                                    download=True,
                                    transform=transform_train)
    print("Data Loaded")

    #Sampler that restricts data loading to a subset of the dataset.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set)

    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=args.num_nodes,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None),
                                               pin_memory=True)
    test_set = datasets.CIFAR10(root="./data",
                                train=False,
                                download=True,
                                transform=transform_test)

    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=args.num_nodes,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)

    print("Training Started")
    start = time.time()
    for epoch in range(Epoch):
        # ensure shuffling works properly across multiple epochs
        train_sampler.set_epoch(epoch)
        e_s = time.time()
        train_model(ddp_model, train_loader, optimizer, training_criterion,
                    epoch)
        e_d = time.time()
        print("Duration of Epoch " + str(epoch) + " :", e_d - start)
        test_model(ddp_model, test_loader, training_criterion)
    print("Total Duration" + str(epoch) + " :", e_d - start)
    cleanup()
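

# setup() and cleanup() are called above but defined elsewhere. A typical pair
# for a Gloo process group, in the spirit of the PyTorch DDP tutorial, might
# look like the sketch below; apart from num_nodes, the args attributes and the
# environment-variable choices are assumptions, and os / torch.distributed
# (as dist) are assumed to be imported at module level.
def setup(args):
    os.environ['MASTER_ADDR'] = args.master_ip
    os.environ['MASTER_PORT'] = str(args.port)
    dist.init_process_group('gloo',
                            rank=int(args.rank),
                            world_size=int(args.num_nodes))


def cleanup():
    dist.destroy_process_group()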