Ejemplo n.º 1
0
def main():
    """Elastic all-reduce demo: repeatedly all-reduce fake gradient tensors
    and resize the cluster at scheduled steps.

    Workers stay in lock-step by agreeing on the max step (via an
    all-reduce MAX) after every cluster change; a worker detached by a
    resize leaves the loop.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    # step -> cluster size requested at that step
    schedule = {
        3: 2,
        6: 3,
        9: 4,
        12: 1,
    }

    kfops.init(args.device)

    all_reduce = kfops.KungFuAllReduce()
    all_reduce_max = kfops.KungFuAllReduce(op=ReduceOp.MAX)
    resize = kfops.KungFuResize()

    # One fake gradient tensor (all ones) per gradient of the chosen model.
    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]

    step = 0
    need_sync = True
    while True:
        if need_sync:
            # Agree on a common step so workers joining after a resize
            # catch up with the rest of the cluster.
            step = sync_step(step, all_reduce_max)
            print('step: %d' % (step))
            need_sync = False

        # Run the collectives for their side effect; the results (and the
        # wall-clock timing the original captured) were never used, so the
        # dead timing code has been removed.
        for x in xs:
            all_reduce(x)

        if step in schedule:
            new_size = ms.Tensor(schedule[step], dtype=ms.uint32)
            print('step=%d, will resize to %d' % (step, schedule[step]))
            changed, detached = resize(new_size)
            print('changed %s, detached: %s' % (changed, detached))
            if changed:
                # Membership changed: re-synchronize step counters.
                need_sync = True
            if detached:
                # This worker is no longer part of the cluster.
                break

        step += 1
        if step > args.steps:
            break
    print('train loop finished')
    kfops.finalize(args.device)
Ejemplo n.º 2
0
def main():
    """All-reduce throughput benchmark over either the MindSpore or the
    KungFu collective backend, selected by ``args.collective``.

    Runs a warmup stage followed by a measured stage; rank 0 prints the
    per-step latency and an estimated data rate.
    """
    args = parse_args()
    # Validate up front so an invalid backend name fails *before* any
    # collective initialization.  The original only raised after
    # kfops.init had already run, leaving it without a finalize.
    if args.collective not in ('mindspore', 'kungfu'):
        raise RuntimeError('invalid collective')

    grad_sizes = model_grad_sizes[args.model]

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    if args.collective == 'mindspore':
        init()
        cluster_size = get_group_size()
        rank = get_rank()
        all_reduce = ms.ops.operations.AllReduce()
    else:
        print('using kungfu collective')
        kfops.init(args.device)
        cluster_size = parse_kungfu_size()
        # NOTE(review): rank is derived from the worker port assuming a
        # 10000 base port — confirm against the KungFu launcher config.
        rank = parse_kungfu_port() - 10000
        all_reduce = kfops.KungFuAllReduce()

    print('rank: %d, size: %d' % (rank, cluster_size))

    # One fake gradient tensor (all ones) per gradient of the chosen model.
    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]

    data_size = sum(grad_sizes) * 4  # 1 float is 4 bytes
    # NOTE(review): presumably models the traffic of a ring-style
    # all-reduce (~4 * (n - 1) payload transfers) — confirm against the
    # collective implementation before trusting the reported rate.
    multiplier = 4 * (cluster_size - 1)
    Gi = 1024 * 1024 * 1024

    def run_stage(name, steps):
        # Run `steps` rounds of all-reduce over every tensor, reporting
        # per-round latency and estimated data rate on rank 0 only.
        for i in range(steps):
            t0 = time.time()
            for x in xs:
                all_reduce(x)
            t1 = time.time()
            d = t1 - t0
            rate = float(data_size) * multiplier / Gi / d
            if rank == 0:
                print('%s %d took %.3fms, data rate: %.3fGiB/s' %
                      (name, i + 1, d * 1e3, rate))

    run_stage('warmup', args.warmup_steps)
    run_stage('step', args.steps)

    if args.collective == 'kungfu':
        kfops.finalize(args.device)
Ejemplo n.º 3
0
def main():
    """Minimal KungFu all-reduce smoke test: all-reduce one small tensor
    and print it before and after."""
    args = parse_args()
    log_args(args)
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)

    kfops.init(args.device)
    all_reduce = kfops.KungFuAllReduce()

    vec = ms.Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
    print(vec)
    print(all_reduce(vec))

    kfops.finalize(args.device)
Ejemplo n.º 4
0
            init()
        # GPU target
        else:
            init()
            context.set_auto_parallel_context(
                device_num=get_group_size(),
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            if args_opt.net == "resnet50":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[85, 160])
        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
            get_rank()) + "/"

    # Optional KungFu collective setup.  The checkpoint directory encodes
    # the worker rank (and, in elastic mode, the cluster version) so
    # concurrent workers don't overwrite each other's checkpoints.
    if args_opt.run_kungfu:
        kfops.init(args_opt.device_target)
        rank = kfops.kungfu_current_rank()
        size = kfops.kungfu_current_cluster_size()
        print('kungfu rank=%d, size=%d' % (rank, size))
        if args_opt.elastic:
            # NOTE(review): os.getenv returns None when the variable is
            # unset, which would make the '+' concatenation below raise
            # TypeError — presumably the elastic launcher always sets
            # KUNGFU_INIT_CLUSTER_VERSION; confirm.
            version = os.getenv('KUNGFU_INIT_CLUSTER_VERSION')
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
                rank) + '@' + version + "/"
        else:
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
                rank) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=100,