Esempio n. 1
0
def _run_in_clean_lab(remote_url: str, add_in_name: str, full_hash: str,
                      cuda_id: int) -> None:
    """
    Run one experiment from a fresh clone in a temporary directory and publish the outcome as an annotated git tag.

    `remote_url` is cloned (including submodules), `full_hash` is fetched and checked out, and
    `summer.run(add_in_name)` is executed in a separate `python` process. Depending on the exit status of that
    process a tag starting with `FAILED_ON` or `SUCCEEDED_ON` plus a timestamp is created on `full_hash`, with the
    captured output of the process as tag message, and pushed to `origin`.

    note: the working directory of the current process is changed into the temporary clone while this function
          runs. It is restored before the temporary directory is removed, so the process is never left inside a
          deleted directory.

    :param remote_url: of git repo
    :param add_in_name: argument handed to `summer.run` in the subprocess
    :param full_hash: git commit hash to be run from
    :param cuda_id: stored (as string) under the `CUDA_VISIBLE_DEVICES_NAME` key of the subprocess environment
    :raises Exception: anything raised while cloning, running or tagging is logged and re-raised unchanged
    """
    with TemporaryDirectory() as lab:
        # remember where we came from; if the current directory no longer exists fall back to the lab's parent
        try:
            previous_cwd = os.getcwd()
        except OSError:
            previous_cwd = os.path.dirname(lab)

        os.chdir(lab)
        logger.debug("running in clean lab: %s", lab)
        try:
            repo_name = f"summer_{full_hash}"
            pbs3.git.clone("--recurse-submodules", "-j8", remote_url,
                           repo_name)
            os.chdir(repo_name)
            pbs3.git("fetch", "origin", full_hash)
            pbs3.git("checkout", "--force", full_hash)
            pbs3.git("submodule", "update", "--recursive")

            # put the fresh clone first on the python path of the subprocess
            test_env = os.environ.copy()
            test_env[PYTHON_PATH_NAME] = os.path.join(
                lab, repo_name) + os.pathsep + test_env.get(
                    PYTHON_PATH_NAME, "")
            test_env[CUDA_VISIBLE_DEVICES_NAME] = str(cuda_id)

            # `!r` embeds the name as a valid python string literal, whatever quotes or backslashes it contains
            cmd = [
                "python", "-c", f"import summer;summer.run({add_in_name!r})"
            ]
            out = subprocess.run(cmd,
                                 env=test_env,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 start_new_session=True)

            # a non-zero return code marks the run as failed
            tag_prefix = FAILED_ON if out.returncode else SUCCEEDED_ON
            tag = f"{tag_prefix}{datetime.now().strftime('%y-%m-%d_%H-%M')}"

            # undecodable bytes in the captured output must not prevent the result from being tagged
            msg = out.stdout.decode("utf-8", errors="replace").replace("'", '"')
            # NOTE(review): the message is wrapped in literal single quotes; whether they end up in the tag
            #               annotation depends on how pbs3 passes arguments to git -- confirm
            pbs3.git("tag", "-a", tag, "-m", f"'{msg}'", full_hash)
            pbs3.git("push", "origin", tag)
            logger.debug("pushed tag '%s' to '%s' at %s", tag, "origin",
                         full_hash)
        except Exception as e:
            logger.exception(e)
            raise
        finally:
            # leave the lab before TemporaryDirectory deletes it
            os.chdir(previous_cwd)
Esempio n. 2
0
    def __init__(self):
        """
        Create the log configuration, record git metadata of the current checkout and check the dataset setup.

        note: `add_in_name`, `valid_dataset`, `test_dataset` and `max_validation_samples` are read here but not
              assigned in this method; presumably they are provided on class level -- confirm against the class.

        :raises AssertionError: if the second entry of `log_scalars_every` or `log_images_every` is neither
                                "iterations" nor "epochs"
        :raises ValueError: if `valid_dataset` equals `test_dataset` and `max_validation_samples` is at least
                            `len(test_dataset)`
        """
        self.log_config = LogConfig()
        log_config = self.log_config
        allowed_units = ("iterations", "epochs")
        assert log_config.log_scalars_every[1] in allowed_units, log_config.log_scalars_every[1]
        assert log_config.log_images_every[1] in allowed_units, log_config.log_images_every[1]

        # raw output of `git rev-parse`, stored as returned (not stripped)
        head_hash_cmd = pbs3.git("rev-parse", "--verify", "HEAD")
        self.commit_hash = head_hash_cmd.stdout

        # only the first line of the latest commit message is kept
        last_commit_body = pbs3.git.log("-1", "--pretty=%B").stdout
        self.commit_subject = last_commit_body.split("\n")[0]

        if self.add_in_name is None:
            # default to the name git reports for HEAD, with all quote characters removed
            head_name_cmd = pbs3.git("rev-parse", "--abbrev-ref", "HEAD")
            head_name = head_name_cmd.stdout.strip()
            self.add_in_name = head_name.replace("'", "").replace('"', "")

        if self.valid_dataset == self.test_dataset:
            if self.max_validation_samples >= len(self.test_dataset):
                raise ValueError("no samples for testing left")
Esempio n. 3
0
    parser.add_argument(
        "--rerun-failed",
        action="store_true",
        help="wether or not to rerun previously failed experiments")
    parser.add_argument(
        "--rerun-succeeded",
        action="store_true",
        help="whether or not to rerun previously succeeded experiments")
    args = parser.parse_args()

    cuda_devices = args.cuda
    if cuda_devices is None:
        cuda_devices = os.environ.get(CUDA_VISIBLE_DEVICES_NAME, None)

    if cuda_devices is None:
        raise Exception(f"{CUDA_VISIBLE_DEVICES_NAME} not set")

    commit_hash_cmd = pbs3.git("rev-parse", "--verify", args.start)
    current_commit_hash = commit_hash_cmd.stdout

    remote_branches = [
        bn if "/" in bn else "origin/" + bn for bn in args.remote_branch
    ]
    experimenter(
        start_commit=current_commit_hash.strip(),
        experiment_identifier=args.exp,
        remote_branches=remote_branches,
        rerun_failed=args.rerun_failed,
        rerun_succeeded=args.rerun_succeeded,
    )