Example #1
            learning_rate = 1.25
            momentum = 0
        job_id = register_job(
            user="******",
            project="sgd",
            experiment=experiment,
            job=name,
            n_workers=n_workers,
            priority=20,
            config_overrides={
                "seed": seed,
                "distributed_backend": "nccl",
                "optimizer_scale_lr_with_factor": n_workers,
                "log_verbosity": 1,
                **shared.language_modeling_base(),
                **shared.sgd_config(learning_rate, momentum=momentum, weight_decay=0.0),
                **shared.optimizer_config(reducer),
            },
            runtime_environment={"clone": {"code_package": code_package}, "script": "train.py"},
            annotations={"description": description},
        )
        print("{} - {}".format(job_id, name))
        registered_ids.append(job_id)


# kubernetes_schedule_job_queue(
#     registered_ids,
#     "ic-registry.epfl.ch/mlo/vogels_experiment",
#     volumes=["pv-mlodata1"],
#     gpus=gpus,
#     parallelism=4,
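
All three examples merge the return values of shared.sgd_config and shared.optimizer_config into config_overrides with **-unpacking, so each helper presumably returns a flat dictionary of override keys. The helpers themselves are not shown in these excerpts; the sketch below is only an assumption about their shape, and every key name that does not appear above is hypothetical.

# Minimal sketch, assuming the shared helpers return flat override dictionaries.
# All key names below that are not visible in the examples are hypothetical.

def sgd_config(learning_rate, momentum=0.9, weight_decay=0.0001):
    # Plain SGD hyperparameters as flat config keys (illustrative names).
    return {
        "optimizer_learning_rate": learning_rate,
        "optimizer_momentum": momentum,
        "optimizer_weight_decay": weight_decay,
    }

def optimizer_config(reducer):
    # Select the gradient reducer to use (illustrative key name).
    return {"optimizer_reducer": reducer}

# Merged exactly as in the register_job calls above:
config_overrides = {
    "seed": 1,
    "distributed_backend": "nccl",
    "optimizer_scale_lr_with_factor": 4,
    **sgd_config(1.25, momentum=0, weight_decay=0.0),
    **optimizer_config("ExampleReducer"),
}
print(config_overrides)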
Example #2
     job=name,
     n_workers=n_workers,
     priority=10,
     config_overrides={
         "seed":
         seed,
         "distributed_backend":
         "nccl",
         "optimizer_scale_lr_with_factor":
         n_workers,
         "num_epochs":
         300,
         "log_verbosity":
         1,
         **shared.sgd_config(learning_rate,
                             momentum=0.9,
                             weight_decay=0.0001),
         **shared.optimizer_config(reducer),
         "optimizer_reducer_compression":
         1 / compression,
     },
     runtime_environment={
         "clone": {
             "code_package": code_package
         },
         "script": "train.py"
     },
     annotations={"description": description},
 )
 print("{} - {}".format(job_id, name))
 registered_ids.append(job_id)
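
Example #2 sets "optimizer_reducer_compression" to 1 / compression, so the enclosing loop presumably sweeps several compression ratios and registers one job per setting. A minimal sketch of such a sweep is shown below; the ratio values, the naming scheme, and the description text are assumptions, not taken from the original script.

# Hypothetical sweep over compression ratios (all values and names are assumed).
for compression in [4, 16, 64, 256]:
    name = "sgd-compression-{}".format(compression)
    description = "SGD with 1/{} gradient compression".format(compression)
    override = {"optimizer_reducer_compression": 1 / compression}
    print(name, description, override)
    # The register_job(...) call from Example #2 would go here,
    # with this override merged into config_overrides.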
Example #3
                        job=name,
                        priority=seed + (100 if backend == "nccl" else 0),
                        n_workers=n_workers,
                        config_overrides={
                            "seed":
                            10000 + seed,
                            "optimizer_scale_lr_with_factor":
                            n_workers,
                            "distributed_backend":
                            backend,
                            "log_verbosity":
                            log_level,
                            "num_epochs":
                            10,
                            **shared.sgd_config(0.1,
                                                momentum=0.9,
                                                weight_decay=0.0001),
                            **shared.optimizer_config(reducer),
                        },
                        runtime_environment={
                            "clone": {
                                "code_package": code_package
                            },
                            "script": "train.py",
                        },
                        annotations={"description": description},
                    )
                    print("{} - {}".format(job_id, name))
                    registered_ids.append(job_id)
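
Example #3 derives the queue priority from the seed and the backend (priority=seed + (100 if backend == "nccl" else 0)) and offsets the config seed by 10000, which suggests the surrounding code loops over both backends and seeds and pushes NCCL runs to the front of the queue. A minimal sketch of that nesting, with assumed value ranges, is:

# Hypothetical enclosing loops (backend list and seed range are assumptions).
for backend in ["nccl", "gloo"]:
    for seed in range(3):
        priority = seed + (100 if backend == "nccl" else 0)  # NCCL jobs are scheduled first
        config_seed = 10000 + seed                           # seed offset as in Example #3
        print(backend, seed, priority, config_seed)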

# kubernetes_schedule_job_queue(