Example #1
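This example runs Horovod-based PyTorch training on a Ray cluster, using a dataset prepared through RayDP; when training finishes, raydp.stop_spark() releases the Spark-on-Ray resources before ray.shutdown().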
import torch.nn.functional as F

def train_fn(torch_ds, num_features):
    # The original excerpt omits the setup that builds `train_loader`,
    # `model`, `optimizer`, and `args` from the RayDP dataset.
    ...

    def train(epoch):
        for batch_idx, data in enumerate(train_loader):
            # Each batch carries the features in all but the last column
            # and the regression target in the last one.
            feature = data[:-1]
            target = data[-1]
            optimizer.zero_grad()
            output = model(*feature)
            loss = F.smooth_l1_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} \tLoss: {:.6f}'.format(
                    epoch, loss.item()))

    for epoch in range(1, args.epochs + 1):
        train(epoch)


if __name__ == '__main__':
    import ray
    import raydp  # needed below for raydp.stop_spark()

    # Connect to the Ray cluster; use address='auto' to join an existing one.
    # ray.init(address='auto')
    ray.init()

    # process_data() is defined earlier in the original script (not shown);
    # it presumably creates the RayDP Spark session and prepares a
    # PyTorch-compatible dataset from it.
    torch_ds, num_features = process_data()

    # Start Horovod workers on Ray.
    from horovod.ray import RayExecutor
    settings = RayExecutor.create_settings(500)  # timeout (seconds) for workers to start
    executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1)
    executor.start()
    executor.run(train_fn, args=[torch_ds, num_features])

    # Stop the Spark-on-Ray session before shutting down Ray.
    raydp.stop_spark()
    ray.shutdown()
Example #2

A teardown method from a class that holds a SparkSession (presumably one created through RayDP): it stops its own Spark session and then releases RayDP's Spark-on-Ray resources.

    def stop(self):
        self.spark.stop()
        raydp.stop_spark()
Example #3
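A minimal teardown helper; note that the RayDP Spark session is stopped before Ray itself is shut down.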
    def stop_all():
        # Release RayDP's Spark-on-Ray resources, then shut down Ray.
        raydp.stop_spark()
        ray.shutdown()
Example #4
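A pytest-style test that creates a small Spark-on-Ray session with raydp.init_spark, runs a trivial count, and cleans up with raydp.stop_spark().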
import raydp

def test_spark(ray_cluster):
    # `ray_cluster` is a pytest fixture (defined elsewhere) that provides a
    # running Ray instance.
    spark = raydp.init_spark("test", 1, 1, "500M")
    result = spark.range(0, 10).count()
    assert result == 10
    raydp.stop_spark()