def test_popdist_train(self):
    """Train with two poprun instances, then re-validate the checkpoint in one process.

    Checks that both the training and test accuracies parsed from the
    distributed run exceed 15.0, and that validating the epoch-2 checkpoint
    with ``train/validate.py`` reports a test accuracy within 0.01 of the
    distributed one.
    """
    gc.collect()
    checkpoint_name = "restore_test_path_test_validation_distributed"
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    checkpoint_dir = os.path.join(parent_dir, checkpoint_name)
    executable = get_current_interpreter_executable()
    try:
        out = run_script(
            "poprun",
            f"--mpi-global-args='--allow-run-as-root' --num-instances=2 --numa-aware=yes --num-replicas=2 --ipus-per-replica 1 {executable} train/train.py --data cifar10 --model resnet18 --epoch 2 "
            "--precision 16.16 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 16 --enable-stochastic-rounding --validation-mode after --dataloader-worker 4 "
            "--norm-type group --norm-num-groups 32 --checkpoint-path restore_test_path_test_validation_distributed",
            python=False)
        train_acc = get_train_accuracy(out)
        assert train_acc > 15.0, "training accuracy not improved"
        test_acc = get_test_accuracy(out)
        assert test_acc > 15.0, "validation accuracy not improved"

        # Check the validation accuracy from a single instance
        out = run_script(
            "train/validate.py",
            "--checkpoint-path restore_test_path_test_validation_distributed/resnet18_cifar10_2.pt")
        restored_test_acc = get_test_accuracy(out)
        assert abs(restored_test_acc - test_acc) < 0.01, \
            "distributed and single instance validation accuracies doesn't match"
    finally:
        # Remove the checkpoint folder even when an assertion fails, so a
        # failed run does not leave stale checkpoints for later runs.
        # ignore_errors keeps a missing folder from masking the real failure.
        shutil.rmtree(checkpoint_dir, ignore_errors=True)
def test_weight_avg(self):
    """Check that offline weight averaging reproduces the accuracy seen in training.

    Trains for three epochs with mean weight averaging, deletes the averaged
    checkpoint that run left behind, regenerates it with
    ``train/weight_avg.py`` and validates the regenerated file. The accuracy
    parsed from the training output must exceed 15 and must match the
    accuracy parsed from the validation output.
    """
    gc.collect()
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    checkpoint_dir = os.path.join(parent_dir, "restore_test_path_weight_avg")
    try:
        out1 = run_script(
            "train/train.py",
            "--data cifar10 --epoch 3 --model resnet18 --precision 16.16 --weight-avg-strategy mean --norm-type group "
            "--norm-num-groups 32 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 32 --checkpoint-path restore_test_path_weight_avg "
            "--weight-avg-N 2 --dataloader-worker 4 --seed 0")
        # Drop the averaged checkpoint so the validation below can only see
        # the file produced by weight_avg.py.
        os.remove(os.path.join(checkpoint_dir, "resnet18_cifar10_3_averaged.pt"))
        _ = run_script(
            "train/weight_avg.py",
            "--checkpoint-path restore_test_path_weight_avg --weight-avg-strategy mean --weight-avg-N 2")
        out2 = run_script(
            "train/validate.py",
            "--checkpoint-path restore_test_path_weight_avg/resnet18_cifar10_3_averaged.pt")
        acc1 = get_test_accuracy(out1)
        # Parse the validation run's output; reading out1 twice made the
        # comparison below trivially true.
        acc2 = get_test_accuracy(out2)
        assert acc1 > 15
        # Same 0.01 tolerance as the other checkpoint-restore tests in this
        # file, rather than exact float equality.
        assert abs(acc1 - acc2) < 0.01
    finally:
        # Clean up even on failure; a missing folder must not hide the
        # original error.
        shutil.rmtree(checkpoint_dir, ignore_errors=True)
def test_validation(self):
    """Check that validating a saved checkpoint reproduces the training-time accuracy.

    Trains for one epoch, then runs ``train/validate.py`` on the epoch-1
    checkpoint and requires the two parsed test accuracies to differ by less
    than 0.01.
    """
    gc.collect()
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    checkpoint_dir = os.path.join(parent_dir, "restore_test_path_test_validation")
    try:
        # create a model
        out = run_script(
            "train/train.py",
            "--data cifar10 --epoch 1 --model resnet18 --precision 16.16 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 32 --seed 0 "
            "--norm-type group --norm-num-groups 32 --checkpoint-path restore_test_path_test_validation --dataloader-worker 4")
        saved_test_acc = get_test_accuracy(out)

        # validate the model
        out = run_script(
            "train/validate.py",
            "--checkpoint-path restore_test_path_test_validation/resnet18_cifar10_1.pt")
        acc = get_test_accuracy(out)

        # close enough
        assert abs(saved_test_acc - acc) < 0.01
    finally:
        # Remove the checkpoint folder whether or not the test passed;
        # ignore_errors avoids masking the real failure if it was never created.
        shutil.rmtree(checkpoint_dir, ignore_errors=True)
def test(inputs, labels, sent_size, model_name):
    """Evaluate the saved LSTM model on a labelled set and return its accuracy.

    Args:
        inputs: Raw inputs; converted with ``utils.generateInputs`` before
            being fed to the model.
        labels: Reference labels handed to ``utils.get_test_accuracy``.
        sent_size: Forwarded to the ``net.Lstm_Net`` constructor.
        model_name: Currently unused; the weights are always loaded from
            ``Config.test_model_name``.
            NOTE(review): confirm whether this was meant to select the
            checkpoint file.

    Returns:
        The value produced by ``utils.get_test_accuracy`` for the model's
        predictions against ``labels``.
    """
    model = net.Lstm_Net(sent_size)
    model.load_state_dict(torch.load(Config.test_model_name))
    model.eval()

    model_inputs = utils.generateInputs(inputs)
    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        labels_hat = model(model_inputs)

    return utils.get_test_accuracy(labels_hat, labels)
def test_half_resolution_training(self):
    """Run half-resolution training with a fine-tune epoch and check test accuracy.

    Trains for one epoch with ``--half-res-training`` plus one fine-tune
    epoch and exponential weight averaging, then requires the parsed test
    accuracy to exceed 15.0.
    """
    gc.collect()
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    checkpoint_dir = os.path.join(parent_dir, "test_half_resolution_training")
    try:
        out = run_script(
            "train/train.py",
            "--data cifar10 --model resnet18 --epoch 1 --precision 16.32 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 32 "
            "--norm-type batch --dataloader-worker 4 --half-res-training --fine-tune-epoch 1 --fine-tune-first-trainable-layer layer3 --weight-avg-strategy exponential "
            "--weight-avg-exp-decay 0.97 --checkpoint-path test_half_resolution_training --seed 0")
        acc = get_test_accuracy(out)
        assert acc > 15.0
    finally:
        # Remove the checkpoint folder even if training or the assertion
        # failed; ignore_errors keeps a missing folder from masking that failure.
        shutil.rmtree(checkpoint_dir, ignore_errors=True)
def test_single_ipu_mobilenet_v3_large_validation_batchnorm(self):
    """Train mobilenet-v3-large with batch norm for three epochs and check test accuracy."""
    gc.collect()
    train_args = (
        "--data cifar10 --model mobilenet-v3-large --epoch 3 --precision 16.32 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 32 "
        "--norm-type batch --enable-stochastic-rounding --dataloader-worker 4 --seed 0"
    )
    training_output = run_script("train/train.py", train_args)
    # The parsed test accuracy has to clear the 15.0 threshold.
    assert get_test_accuracy(training_output) > 15.0
def test_single_ipu_validation_groupnorm(self):
    """Train resnet18 with group norm for three epochs and check test accuracy."""
    gc.collect()
    train_args = (
        "--data cifar10 --model resnet18 --epoch 3 --precision 16.16 --optimizer sgd_combined --lr 0.1 --batch-size 2 --gradient-accumulation 32 "
        "--norm-type group --norm-num-groups 32 --enable-stochastic-rounding --dataloader-worker 4 --seed 0"
    )
    training_output = run_script("train/train.py", train_args)
    # The parsed test accuracy has to clear the 15.0 threshold.
    assert get_test_accuracy(training_output) > 15.0
def train_and_test(gans, trial, num_real, num_synth, fin, num_epoch=200):
    """Train a classifier on a real/synthetic mix and log its test accuracy.

    Args:
        gans: Passed to ``mix`` together with the two sample counts.
        trial: Trial identifier used in the result path.
        num_real: Count of real samples requested from ``mix``.
        num_synth: Count of synthetic samples requested from ``mix``.
        fin: Writable text stream that receives one result line.
        num_epoch: Number of epochs forwarded to the classifier's ``train``.
    """
    run_name = "./classifier_results/trial{}/{}-{}".format(trial, num_real, num_synth)
    loader = mix(gans, num_real, num_synth)

    # select from {SimpleClassifier, DeepClassifier}
    clf = classifier.SimpleClassifier()
    clf.train(loader, run_name, num_epoch=num_epoch)

    accuracy = utils.get_test_accuracy(clf)
    result_line = "Model: {}; Accuracy: {}\n".format(run_name, accuracy)
    fin.write(result_line)