Example #1
0
def make_data_loader(args, mode, is_consistent=False, synthetic=False):
    """Create the data loader for one phase of a run.

    Args:
        args: Configuration object. For ``"train"`` this reads
            ``batch_size`` and ``num_image``; for ``"validation"`` it reads
            ``val_global_batch_size``, ``val_batch_size`` and
            ``val_samples_per_epoch``. ``num_classes`` is read for the
            synthetic loader, ``ofrecord_path`` and ``ofrecord_part_num``
            for the OFRecord loader.
        mode: ``"train"`` or ``"validation"``.
        is_consistent: When true, the loader receives an all-device ``"cpu"``
            placement and a ``split(0)`` SBP, and its batch size is the
            total batch size instead of the per-process one.
        synthetic: When true, a ``SyntheticDataLoader`` is built and the
            result of moving it to ``"cuda"`` is returned.

    Returns:
        ``SyntheticDataLoader(...).to("cuda")`` when ``synthetic`` is true,
        otherwise an ``OFRecordDataLoader``.

    Raises:
        AssertionError: If ``mode`` is neither ``"train"`` nor
            ``"validation"``.
    """
    assert mode in ("train", "validation")

    if mode == "train":
        per_process_batch = args.batch_size
        # The training total is the per-process batch times the world size.
        global_batch = per_process_batch * flow.env.get_world_size()
        sample_count = args.num_image
    else:
        global_batch = args.val_global_batch_size
        per_process_batch = args.val_batch_size
        sample_count = args.val_samples_per_epoch

    if is_consistent:
        loader_placement = flow.env.all_device_placement("cpu")
        loader_sbp = flow.sbp.split(0)
        loader_batch = global_batch
    else:
        loader_placement = None
        loader_sbp = None
        loader_batch = per_process_batch

    if synthetic:
        synthetic_loader = SyntheticDataLoader(
            batch_size=loader_batch,
            num_classes=args.num_classes,
            placement=loader_placement,
            sbp=loader_sbp,
        )
        return synthetic_loader.to("cuda")

    return OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode=mode,
        dataset_size=sample_count,
        batch_size=loader_batch,
        total_batch_size=global_batch,
        data_part_num=args.ofrecord_part_num,
        placement=loader_placement,
        sbp=loader_sbp,
    )
Example #2
0
def main(args):
    """Train MobileNetV2 eagerly, validating and checkpointing every epoch.

    Args:
        args: Parsed options. Reads ``ofrecord_path``, ``train_batch_size``,
            ``val_batch_size``, ``load_checkpoint``, ``learning_rate``,
            ``mom``, ``epochs`` and ``save_checkpoint_path``.

    Side effects:
        Prints progress to stdout, saves one checkpoint per epoch under
        ``args.save_checkpoint_path`` and writes the sampled training losses
        to ``of_losses.txt`` in the current working directory.
    """
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )

    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # oneflow init
    start_t = time.time()
    mobilenetv2_module = mobilenet_v2()
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        mobilenetv2_module.load_state_dict(flow.load(args.load_checkpoint))

    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()

    mobilenetv2_module.to("cuda")
    of_cross_entropy.to("cuda")

    of_sgd = flow.optim.SGD(mobilenetv2_module.parameters(),
                            lr=args.learning_rate,
                            momentum=args.mom)

    of_losses = []
    # Accuracy denominator: every validation batch is counted as full.
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        mobilenetv2_module.train()

        for step in range(len(train_data_loader)):
            image, label = train_data_loader.get_batch()

            # oneflow train
            start_t = time.time()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = mobilenetv2_module(image)
            loss = of_cross_entropy(logits, label)
            loss.backward()
            of_sgd.step()
            of_sgd.zero_grad()
            end_t = time.time()
            # Only every print_interval-th loss is fetched and logged.
            if step % print_interval == 0:
                loss_value = loss.numpy()
                of_losses.append(loss_value)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".
                    format(epoch, step, loss_value, end_t - start_t))

        print("epoch %d train done, start validation" % epoch)

        mobilenetv2_module.eval()
        correct_of = 0
        for _ in range(len(val_data_loader)):
            image, label = val_data_loader.get_batch()

            image = image.to("cuda")
            with flow.no_grad():
                logits = mobilenetv2_module(image)
                predictions = logits.softmax()
            clsidxs = np.argmax(predictions.numpy(), axis=1)

            # Flatten so a trailing singleton label axis cannot broadcast the
            # comparison into a matrix; one label per sample is compared.
            label_nd = label.numpy().reshape(-1)
            correct_of += int(np.count_nonzero(clsidxs == label_nd))

        top1 = correct_of / all_samples
        print("epoch %d, oneflow top1 val acc: %f" % (epoch, top1))

        flow.save(
            mobilenetv2_module.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                "epoch_%d_val_acc_%f" % (epoch, top1),
            ),
        )

    # The context manager closes the log even if a write fails.
    with open("of_losses.txt", "w") as writer:
        for loss_value in of_losses:
            writer.write("%f\n" % loss_value)
Example #3
0
def setup(args):
    """Build loaders, twin ResNet-50 models and their training helpers.

    Two ``resnet50`` instances are created; the graph model is loaded with
    the eager model's state dict so both start from the same weights. Each
    gets its own SGD optimizer, and a single CUDA ``CrossEntropyLoss`` is
    shared by the eager entry and the training graph.

    Args:
        args: Parsed options. Reads ``ofrecord_path``, ``train_batch_size``,
            ``val_batch_size``, ``learning_rate`` and ``mom``.

    Returns:
        dict with the keys ``"train_dataloader"``, ``"val_dataloader"``,
        ``"eager"`` (``[model, optimizer, criterion]``) and ``"graph"``
        (``[model, train_graph, eval_graph]``).
    """
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )

    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # model setup
    eager_model = resnet50()
    graph_model = resnet50()
    # Start both models from identical weights.
    graph_model.load_state_dict(eager_model.state_dict())

    eager_model.to("cuda")
    graph_model.to("cuda")
    # optimizer setup
    eager_optimizer = flow.optim.SGD(eager_model.parameters(),
                                     lr=args.learning_rate,
                                     momentum=args.mom)
    graph_optimizer = flow.optim.SGD(graph_model.parameters(),
                                     lr=args.learning_rate,
                                     momentum=args.mom)

    # criterion setup (built once; shared by the eager entry and the graph)
    criterion = flow.nn.CrossEntropyLoss()
    criterion = criterion.to("cuda")

    class ModelTrainGraph(flow.nn.Graph):
        """Graph running forward, loss and backward on ``graph_model``."""

        def __init__(self):
            super().__init__()
            self.graph_model = graph_model
            self.criterion = criterion
            self.add_optimizer(graph_optimizer)

        def build(self, image, label):
            """Return the loss for one batch after calling ``backward`` on it."""
            logits = self.graph_model(image)
            loss = self.criterion(logits, label)
            loss.backward()
            return loss

    class ModelEvalGraph(flow.nn.Graph):
        """Graph producing softmax predictions from ``graph_model``."""

        def __init__(self):
            super().__init__()
            self.graph_model = graph_model

        def build(self, image):
            """Return ``logits.softmax()`` computed under ``flow.no_grad()``."""
            with flow.no_grad():
                logits = self.graph_model(image)
                predictions = logits.softmax()
            return predictions

    model_train_graph = ModelTrainGraph()
    model_eval_graph = ModelEvalGraph()

    return {
        "train_dataloader": train_data_loader,
        "val_dataloader": val_data_loader,
        "eager": [eager_model, eager_optimizer, criterion],
        "graph": [graph_model, model_train_graph, model_eval_graph],
    }
Example #4
0
def main(args):
    """Run eager training with per-epoch validation and checkpointing.

    Args:
        args: Parsed options. Reads ``ofrecord_path``, ``train_batch_size``,
            ``val_batch_size``, ``load_checkpoint``, ``learning_rate``,
            ``mom``, ``weight_decay``, ``epochs`` and
            ``save_checkpoint_path``; it is also forwarded to
            ``build_model``, ``train_one_epoch`` and ``valid``.

    Side effects:
        Prints progress, saves a checkpoint after every epoch, keeps a
        ``best_model`` checkpoint for the highest accuracy seen so far, and
        passes the loss and accuracy histories to ``save_logs``.
    """

    def open_split(split, sample_count, batch):
        # Both loaders read from the same OFRecord root.
        return OFRecordDataLoader(
            ofrecord_root=args.ofrecord_path,
            mode=split,
            dataset_size=sample_count,
            batch_size=batch,
        )

    # Data Setup
    train_loader = open_split("train", 9469, args.train_batch_size)
    val_loader = open_split("val", 3925, args.val_batch_size)

    # Model Setup
    print("***** Initialization *****")
    init_started = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        model.load_state_dict(flow.load(args.load_checkpoint))
    init_finished = time.time()
    print("init time : {}".format(init_finished - init_started))

    # Training Setup
    criterion = flow.nn.CrossEntropyLoss()
    model.to("cuda")
    criterion.to("cuda")
    optimizer = flow.optim.SGD(
        model.parameters(),
        lr=args.learning_rate,
        momentum=args.mom,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = flow.optim.lr_scheduler.StepLR(
        optimizer, step_size=30, gamma=0.1
    )

    epoch_losses = []
    epoch_accuracies = []
    best_acc = 0.0
    for epoch in range(args.epochs):
        print("***** Runing Training *****")
        epoch_loss = train_one_epoch(
            args, model, criterion, train_loader, optimizer, epoch,
            lr_scheduler,
        )
        print("***** Run Validation *****")
        epoch_acc = valid(args, model, criterion, val_loader)

        # Every epoch gets its own checkpoint, named after its accuracy.
        print("***** Save Checkpoint *****")
        epoch_dir = os.path.join(
            args.save_checkpoint_path,
            "epoch_%d_val_acc_%f" % (epoch, epoch_acc),
        )
        save_checkpoint(model, epoch_dir)
        print("Save checkpoint to: ", epoch_dir)

        # A strictly better accuracy replaces the stored best model.
        if best_acc < epoch_acc:
            best_dir = os.path.join(args.save_checkpoint_path, "best_model")
            if os.path.exists(best_dir):
                shutil.rmtree(best_dir, ignore_errors=True)
            save_checkpoint(model, best_dir)
            best_acc = epoch_acc

        epoch_losses.append(epoch_loss)
        epoch_accuracies.append(epoch_acc)
    print("End Training!")
    print("Max Accuracy: ", best_acc)

    # Persist the per-epoch histories.
    print("***** Save Logs *****")
    log_jobs = (
        (epoch_losses, "eager/losses.txt", "Save loss info to: "),
        (epoch_accuracies, "eager/accuracy.txt", "Save acc info to: "),
    )
    for history, log_path, message in log_jobs:
        save_logs(history, log_path)
        print(message, log_path)
Example #5
0
def main(args):
    """Train and validate the model from ``build_model`` via ``nn.Graph``.

    The data loaders are owned by the graphs, so each call to the training
    or evaluation graph consumes one batch.

    Args:
        args: Parsed options. Reads ``results``, ``tag``, ``ofrecord_path``,
            ``train_batch_size``, ``val_batch_size``, ``image_size``,
            ``load_checkpoint``, ``learning_rate``, ``mom``, ``epochs`` and
            ``save_checkpoint_path``; it is also forwarded to
            ``build_model``.

    Side effects:
        Creates ``<args.results>/<args.tag>`` and ``graph/``, prints
        progress, saves one checkpoint per epoch under
        ``args.save_checkpoint_path`` and writes ``graph/losses.txt`` and
        ``graph/accuracy.txt``.
    """
    # path setup
    os.makedirs(os.path.join(args.results, args.tag), exist_ok=True)
    # Create the log directory up front so a missing or unwritable path
    # fails before training instead of after the last epoch.
    os.makedirs("graph", exist_ok=True)

    # build dataloader
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
        image_size=args.image_size,
    )

    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
        image_size=args.image_size,
    )

    # oneflow init
    start_t = time.time()
    model = build_model(args)
    if args.load_checkpoint != "":
        print("load_checkpoint >>>>>>>>> ", args.load_checkpoint)
        model.load_state_dict(flow.load(args.load_checkpoint))

    end_t = time.time()
    print("init time : {}".format(end_t - start_t))

    of_cross_entropy = flow.nn.CrossEntropyLoss()

    model.to("cuda")
    of_cross_entropy.to("cuda")

    of_sgd = flow.optim.SGD(
        model.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    class ViTNetGraph(flow.nn.Graph):
        """Training graph: load a batch, run forward/backward, return loss."""

        def __init__(self):
            super().__init__()
            self.model = model
            self.cross_entropy = of_cross_entropy
            self.add_optimizer(of_sgd)
            self.train_data_loader = train_data_loader

        def build(self):
            image, label = self.train_data_loader()
            image = image.to("cuda")
            label = label.to("cuda")
            logits = self.model(image)
            loss = self.cross_entropy(logits, label)
            loss.backward()
            return loss

    vit_graph = ViTNetGraph()

    class ViTEvalGraph(flow.nn.Graph):
        """Evaluation graph: load a batch, return predictions and labels."""

        def __init__(self):
            super().__init__()
            self.model = model
            self.val_data_loader = val_data_loader

        def build(self):
            image, label = self.val_data_loader()
            image = image.to("cuda")
            with flow.no_grad():
                logits = self.model(image)
                predictions = logits.softmax()
            return predictions, label

    vit_eval_graph = ViTEvalGraph()

    of_losses = []
    of_accuracy = []
    # Accuracy denominator: every validation batch is counted as full.
    all_samples = len(val_data_loader) * args.val_batch_size
    print_interval = 20

    for epoch in range(args.epochs):
        model.train()

        for step in range(len(train_data_loader)):
            # oneflow graph train
            start_t = time.time()
            loss = vit_graph()
            end_t = time.time()
            # Only every print_interval-th loss is fetched and logged.
            if step % print_interval == 0:
                loss_value = loss.numpy()
                of_losses.append(loss_value)
                print(
                    "epoch {} train iter {} oneflow loss {}, train time : {}".format(
                        epoch, step, loss_value, end_t - start_t
                    )
                )

        print("epoch %d train done, start validation" % epoch)

        model.eval()
        correct_of = 0
        for _ in range(len(val_data_loader)):
            predictions, label = vit_eval_graph()
            clsidxs = np.argmax(predictions.numpy(), axis=1)

            # Flatten so a trailing singleton label axis cannot broadcast the
            # comparison into a matrix; one label per sample is compared.
            label_nd = label.numpy().reshape(-1)
            correct_of += int(np.count_nonzero(clsidxs == label_nd))

        top1 = correct_of / all_samples
        of_accuracy.append(top1)
        print("epoch %d, oneflow top1 val acc: %f" % (epoch, top1))

        flow.save(
            model.state_dict(),
            os.path.join(
                args.save_checkpoint_path,
                "epoch_%d_val_acc_%f" % (epoch, top1),
            ),
        )

    # Context managers close each log even if a write fails.
    with open("graph/losses.txt", "w") as writer:
        for loss_value in of_losses:
            writer.write("%f\n" % loss_value)

    with open("graph/accuracy.txt", "w") as writer:
        for accuracy in of_accuracy:
            writer.write("%f\n" % accuracy)