# NOTE(review): this line is whitespace-mangled — several source lines were collapsed
# into one, so it is not valid Python as written. It appears to contain three fragments:
#   1. The body of a training function (its `def train(epoch):` header is not visible
#      here — presumably defined just above this fragment; confirm against the full file):
#      one SGD pass over `train_loader` with smooth-L1 loss and periodic loss logging.
#   2. An epoch loop `for epoch in range(1, args.epochs + 1): train(epoch)` that drives it.
#   3. A script entry point (`if __name__ == '__main__':`) that connects to Ray, prepares
#      data via `process_data()`, launches a single Horovod worker on Ray via RayExecutor,
#      runs `train_fn` on it, then tears down Spark (`raydp.stop_spark()`) and Ray.
# Free names (`train_fn`, `process_data`, `args`, `model`, `optimizer`, `train_loader`,
# `F`, `raydp`) are defined elsewhere in the original file — not visible from here.
# Left byte-identical; restoring the original line structure requires the unmangled file.
for batch_idx, data in enumerate(train_loader): feature = data[:-1] target = data[-1] optimizer.zero_grad() output = model(*feature) loss = F.smooth_l1_loss(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: print('Train Epoch: {} \tLoss: {:.6f}'.format( epoch, loss.item())) for epoch in range(1, args.epochs + 1): train(epoch) if __name__ == '__main__': # connect to ray cluster import ray # ray.init(address='auto') ray.init() torch_ds, num_features = process_data() # Start horovod workers on Ray from horovod.ray import RayExecutor settings = RayExecutor.create_settings(500) executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1) executor.start() executor.run(train_fn, args=[torch_ds, num_features]) raydp.stop_spark() ray.shutdown()
def stop(self):
    """Shut down this object's Spark session, then release the raydp-managed cluster.

    Order matters: the SparkSession is stopped before the raydp Spark-on-Ray
    resources are torn down.
    """
    self.spark.stop()
    raydp.stop_spark()
def stop_all():
    """Tear everything down: first the raydp Spark cluster, then the Ray connection.

    Spark is stopped before Ray because the Spark executors run on Ray workers.
    """
    raydp.stop_spark()
    ray.shutdown()
def test_spark(ray_cluster):
    """Smoke-test raydp: start a 1-executor Spark session, count a range, stop it.

    `ray_cluster` is a pytest fixture (defined elsewhere) that provides a running
    Ray cluster for the session to attach to.
    """
    # NOTE(review): the memory string "500 M" contains an embedded space — confirm
    # raydp.init_spark's memory parser accepts this format (vs. "500M").
    session = raydp.init_spark("test", 1, 1, "500 M")
    # spark.range(0, 10) yields the 10 integers 0..9, so count() must be 10.
    assert session.range(0, 10).count() == 10
    raydp.stop_spark()