Example #1
0
 class Args(ta.TypedArgs):
     # Declarative argument spec with three fields:
     #   foo    -- `--foo`, converted with int, default 42
     #   bar    -- declared without flag names, nargs='*', default [1, 2, 3]
     #   config -- `--config`, action='append', starts from an empty list
     foo: int = ta.add_argument(
         '--foo',
         default=42,
         type=int,
     )
     bar: List[int] = ta.add_argument(
         default=[1, 2, 3],
         nargs='*',
     )
     config: List[str] = ta.add_argument(
         '--config',
         action='append',
         type=str,
         default=[],
     )
Example #2
0
 class Args5(TypedArgs):
     # One attribute fed by two flags: `--str` and `--int` both use
     # action='append_const', with the type objects `str` and `int` as the
     # respective constants. The two declarations are grouped in a tuple.
     types: List[Union[str, int]] = (
         add_argument('--str', const=str, action='append_const'),
         add_argument('--int', const=int, action='append_const'),
     )
Example #3
0
class Args(ta.TypedArgs):
    # Argument spec for a training script: dataset path, model architecture
    # and the number of data-loading workers.

    # Plain class-level default; not declared through add_argument.
    foo: str = 'bar'
    data: str = ta.add_argument(
        help='path to dataset',
        metavar='DIR',
    )
    arch: str = ta.add_argument(
        '-a',
        '--arch',
        help='model architecture (default: resnet18)',
        default='resnet18',
        metavar='ARCH',
    )
    num_workers: int = ta.add_argument(
        '-j',
        '--workers',
        help='number of data loading workers (default: 4)',
        metavar='N',
        default=4,
    )
Example #4
0
class Args(BaseArgs):
    # Options for loading checkpoints, validating, and continuing a previous
    # run. `experiment_dir`, `config` and `RUN_DIR_NAME_REGEX` are read by
    # resolve_continue() but are not declared in this class.
    load_checkpoint: Optional[Path] = add_argument(
        '--load-checkpoint',
        help='path to the checkpoint file to be loaded',
        required=False,
    )
    load_model: Optional[Path] = add_argument(
        '--load-model',
        help='path to the checkpoint file to be loaded, but only load model.',
        required=False,
    )
    validate: bool = add_argument(
        '--validate',
        help='Only run final validate then exit',
        action='store_true',
    )
    moco_checkpoint: Optional[str] = add_argument(
        '--mc',
        '--moco-checkpoint',
        help='load moco checkpoint',
    )
    seed: Optional[int] = add_argument('--seed', help='random seed')
    # The default is computed once, when this class body is executed.
    world_size: int = add_argument(
        '--ws',
        '--world-size',
        help='total processes',
        default=torch.cuda.device_count(),
    )
    _continue: bool = add_argument(
        '--continue',
        help='Use previous config and checkpoint',
        action='store_true',
    )
    no_scale_lr: bool = add_argument(
        '--no-scale-lr',
        help='Do not change lr according to batch size',
        action='store_true',
    )

    def resolve_continue(self):
        """Fill in ``config`` and ``load_checkpoint`` for a continued run.

        Does nothing unless ``--continue`` was given. Otherwise:

        * if ``config`` is unset, it becomes the ``config.json`` of the
          highest-numbered run directory (name matching
          ``RUN_DIR_NAME_REGEX``) inside ``experiment_dir`` that has one;
        * if ``load_checkpoint`` is unset, it becomes
          ``experiment_dir / 'checkpoint.pth.tar'`` when that file exists;
          a missing checkpoint only produces a warning.

        Raises:
            EnvironmentError: the experiment directory does not exist, or no
                previous run config could be found.
        """
        if not self._continue:
            return

        exp_dir = self.experiment_dir
        if not exp_dir.exists():
            raise EnvironmentError(
                f'Experiment directory "{exp_dir}" does not exists.')

        if self.config is None:
            newest_id = -1
            for entry in exp_dir.iterdir():
                matched = self.RUN_DIR_NAME_REGEX.match(entry.name)
                if matched is None:
                    continue
                candidate_id = int(matched.group(1))
                # Only directories with a larger id than the best so far
                # are worth inspecting.
                if candidate_id <= newest_id or not entry.is_dir():
                    continue
                candidate_config = entry / 'config.json'
                if not candidate_config.exists():
                    continue
                newest_id = candidate_id
                self.config = candidate_config

            if self.config is None:
                raise EnvironmentError('No previous run config found')
            logger.info('Continue using previous config: "%s"', self.config)

        if self.load_checkpoint is not None:
            return

        previous_checkpoint = exp_dir / 'checkpoint.pth.tar'
        if not previous_checkpoint.exists():
            logger.warning('No previous checkpoint found')
            return
        self.load_checkpoint = previous_checkpoint
        logger.info('Continue using previous checkpoint: "%s"',
                    self.load_checkpoint)
Example #5
0
class Args(TypedArgs):
    # Argument spec for a training script, with a custom parser factory.

    # Plain class-level default; not declared through add_argument.
    foo: str = 'bar'
    data: str = add_argument(
        help='path to dataset',
        metavar='DIR',
    )
    arch: str = add_argument(
        '-a',
        '--arch',
        help='model architecture (default: resnet18)',
        default='resnet18',
        metavar='ARCH',
    )
    num_workers: int = add_argument(
        '-j',
        '--workers',
        help='number of data loading workers (default: 4)',
        metavar='N',
        default=4,
    )

    def parser_factory(self):
        """Return a fresh ``argparse.ArgumentParser`` whose prog is 'PROG'."""
        parser = argparse.ArgumentParser(prog='PROG')
        return parser
Example #6
0
class BaseArgs(Base):
    # Adds a `--device-id` option and device helpers on top of Base.

    device_id: int = ta.add_argument(
        '--device-id',
        default=DEFAULT_DEVICE_ID,  # -1 for cpu
        type=int,
    )

    @property
    def device(self) -> flow.device:
        """Return the CUDA device indexed by ``LOCAL_RANK``.

        ``device_id`` is not consulted here. A variant that was previously
        left commented out in this property chose ``'cpu'`` when
        ``device_id < 0`` and ``'cuda:<device_id>'`` otherwise.
        """
        device_name = f'cuda:{LOCAL_RANK}'
        return flow.device(device_name)

    def is_cuda_available(self) -> bool:
        """Return True when ``device_id`` is greater than -1."""
        return self.device_id > -1
0
 class Args(TypedArgs):
     # Declares an optional `--foo`, a `command` field with no flag names, and
     # an `args` list whose nargs is argparse.REMAINDER.
     # NOTE(review): flag-less add_argument() calls are presumably positionals
     # named after the attribute -- confirm against the typed-args docs.
     foo: str = add_argument('--foo')
     command: str = add_argument()
     args: List[str] = add_argument(nargs=argparse.REMAINDER)
Example #8
0
 class Args(TypedArgs):
     # Single field declared without flag names and with nargs='+'.
     foo: List[str] = add_argument(nargs='+')
Example #9
0
 class Args(TypedArgs):
     # Three list fields, all with nargs='*': `--foo` and `--bar` are given
     # flag names, `baz` is declared without any.
     foo: List[str] = add_argument('--foo', nargs='*')
     bar: List[str] = add_argument('--bar', nargs='*')
     baz: List[str] = add_argument(nargs='*')
Example #10
0
 class Args(TypedArgs):
     # `foo` is reachable as `-f` or `--foo` and annotated Optional[str];
     # `bar` is declared without flag names.
     foo: Optional[str] = add_argument('-f', '--foo')
     bar: str = add_argument()
Example #11
0
 class Args(TypedArgs):
     # Both fields use nargs='?' with default 'd'; `--foo` additionally
     # supplies const='c'.
     foo: str = add_argument('--foo', nargs='?', const='c', default='d')
     bar: str = add_argument(nargs='?', default='d')
Example #12
0
class BaseArgs(Base):
    # Distributed-run options and helpers layered on top of Base.
    # `self.gpu` and `self.experiment_dir` are used below but are not declared
    # in this class.

    rank_start: int = ta.add_argument(
        '--rank-start',
        type=int,
        default=0,
        help='',
    )

    # When left at None the backend is inferred by the `dist_backend` property.
    _dist_backend: str = ta.add_argument(
        "--dist-backend",
        type=str,
        choices=["nccl", "gloo"],
        default=None,
    )
    world_size: int = ta.add_argument(
        '--world-size',
        type=int,
        default=1,
    )
    dist_url: Optional[str] = ta.add_argument(
        "--dist-url",
        type=str,
    )

    @property
    def device(self) -> torch.device:
        """Return the 'cuda' device if ``gpu`` is set and CUDA is available,
        otherwise the 'cpu' device."""
        use_cuda = self.gpu and torch.cuda.is_available()
        return torch.device('cuda' if use_cuda else 'cpu')

    @property
    def dist_backend(self) -> str:
        """Return the configured backend, inferring one when none was given.

        The inferred backend is "nccl" when ``gpu`` is set and NCCL is
        available, and "gloo" otherwise; each inference is logged.
        """
        configured = self._dist_backend
        if configured is not None:
            return configured

        inferred = "nccl" if self.gpu and dist.is_nccl_available() else "gloo"
        _logger.info("infer dist_backend: %s", inferred)
        return inferred

    def init_process_group_from_file(self, local_rank: int) -> int:
        """Initialise the process group through ``experiment_dir / 'dist_init'``.

        Returns:
            ``rank_start + local_rank``, the rank passed to the initialiser.
        """
        global_rank = self.rank_start + local_rank
        init_file = self.experiment_dir / 'dist_init'

        # Resolves to the module-level helper of the same name, not to this
        # method.
        init_process_group_from_file(
            self.dist_backend,
            init_file,
            world_size=self.world_size,
            rank=global_rank,
        )
        return global_rank

    def init_process_group_from_tcp(self, local_rank: int) -> int:
        """Initialise the process group using ``dist_url`` as the init method.

        Returns:
            ``rank_start + local_rank``, the rank passed to the initialiser.
        """
        assert self.dist_url

        global_rank = self.rank_start + local_rank
        dist.init_process_group(
            self.dist_backend,
            init_method=self.dist_url,
            world_size=self.world_size,
            rank=global_rank,
        )
        return global_rank

    def try_cuda_set_device(self, local_rank: int):
        """Select ``gpu[local_rank]`` as the current CUDA device.

        Does nothing when ``gpu`` is empty/falsy.
        """
        if not self.gpu:
            return
        torch.cuda.set_device(self.gpu[local_rank])
Example #13
0
class Args(TypedArgs):
    # Two store_true switches (`--dlp`, `--tfds`) plus a batch size
    # (`-b/--batch-size`, default 128) and a worker count
    # (`-n/--num-workers`, default 2).
    dlp: bool = add_argument('--dlp', action='store_true')
    tfds: bool = add_argument('--tfds', action='store_true')
    batch_size: int = add_argument('-b', '--batch-size', default=128)
    num_workers: int = add_argument('-n', '--num-workers', default=2)
Example #14
0
class BaseArgs(ta.TypedArgs):
    # Shared command-line options for an experiment run, plus helpers that
    # prepare the experiment directory and record how the run was started.

    config_file: Optional[Path] = ta.add_argument(
        "-c", "--config", type=Path, help="config 文件"
    )

    experiment_dir: Path = ta.add_argument(
        "-e", "--experiment-dir", type=Path, default=Path("exps/000"), help="实验目录"
    )

    apply: List[str] = ta.add_argument(
        "-a",
        "--apply",
        type=str,
        action="append",
        default=[],
        help="额外 config,可 merge 到 main config",
    )

    print_freq: int = ta.add_argument(
        "--print-freq", "--pf", type=int, default=1000, help="显示 log 的频率,一般为10"
    )

    temp_dir: Path = ta.add_argument(
        "--temp-dir", type=Path, default=Path("temp"), help="临时目录,记得定期删"
    )
    debug: bool = ta.add_argument(
        "-d", "--debug", action="store_true", help="debug 模式")
    no_tqdm: bool = ta.add_argument(
        "--no-tqdm", action="store_true", help="关闭 tqdm")

    resume: Optional[Path] = ta.add_argument(
        "--resume", type=Path, help="resume checkpoint path"
    )

    force: bool = ta.add_argument(
        "-f", "--force", action="store_true", help="移除旧实验目录到 temp dir,强制创建新实验目录"
    )

    # Defaults to CPU (empty list); XLA support may be added later.
    gpu: List[int] = ta.add_argument(
        "--gpu", type=parse_gpu_list, default=[], help="指定gpu,`1,2,5-7 -> [1,2,5,6,7]`"
    )

    def try_make_experiment_dir(self):
        """Create ``experiment_dir``, first moving aside any existing one.

        If the directory already exists and ``force`` is set, the old
        directory is renamed into ``temp_dir`` under its own name plus a
        timestamp suffix. Without ``force`` a message is printed and the
        process exits.

        Raises:
            SystemExit: the directory exists and ``force`` is not set.
        """
        if self.experiment_dir.exists():
            if self.force:
                timestamp = datetime.now().strftime('%Y_%m_%d-%H.%M.%S')
                new_experiment_name = self.experiment_dir.name + '-' + timestamp
                new_experiment_dir = self.temp_dir / new_experiment_name
                print(
                    f"move old experiment dir from {self.experiment_dir} to {new_experiment_dir}"
                )
                # Make sure the temp dir exists before renaming into it.
                self.temp_dir.mkdir(parents=True, exist_ok=True)
                self.experiment_dir.rename(new_experiment_dir)
            else:
                print(
                    f'实验目录 {self.experiment_dir} 已存在,可使用 -f/--force 参数覆盖实验目录')
                # sys.exit instead of the `exit` builtin: the latter is
                # installed by the `site` module and is not guaranteed to
                # exist. Status 0 is kept for backward compatibility.
                sys.exit(0)

        self.experiment_dir.mkdir(parents=True, exist_ok=False)

    def config(self) -> dict:
        """Parse ``config_file`` together with the ``apply`` extras.

        Returns:
            The result of ``from_snippet`` applied to the parsed snippet.

        Raises:
            AssertionError: ``config_file`` is not set.
        """
        assert self.config_file, "请指定 config file"
        snippet = parse_config(self.config_file, self.apply)
        config = from_snippet(snippet)
        return config

    def save_config(self, config: Optional[dict] = None, name: str = "config.json"):
        """Dump ``config`` as JSON to ``experiment_dir / name``.

        Args:
            config: Configuration to save. A falsy value (``None`` or an
                empty dict) means "load it via :meth:`config`".
            name: File name inside the experiment directory.
        """
        if not config:
            config = self.config()
        dump_to_json(config, self.experiment_dir / name)

    def save_command(self, name: str = 'run.sh'):
        """Write a shell script that re-runs the current command line.

        The script changes to the current working directory, exports
        ``CUDA_VISIBLE_DEVICES`` when it is set, and then invokes the current
        interpreter with the shell-quoted ``sys.argv``.

        Args:
            name: File name inside the experiment directory.
        """
        script_path = self.experiment_dir / name
        with open(script_path, 'w') as f:
            f.write(f"cd {quote(os.getcwd())}\n")
            envs = ['CUDA_VISIBLE_DEVICES']
            for env in envs:
                value = os.environ.get(env)
                if value is not None:
                    f.write(f'export {env}={quote(value)}\n')

            args_str = ' '.join(quote(arg) for arg in sys.argv)
            # Call the interpreter directly rather than via
            # `alias python=...`: bash does not expand aliases in
            # non-interactive scripts, so the alias would be ignored and
            # whatever `python` is on PATH would run instead. Quoting keeps
            # interpreter paths containing spaces intact.
            interpreter = quote(sys.executable or 'python')
            f.write(f'{interpreter} {args_str}\n')

        _logger.info('save command to %s', script_path)
Example #15
0
 class Args3(TypedArgs):
     # Boolean switches: `--foo` uses action='store_true', while `--bar` and
     # `--baz` use action='store_false'.
     foo: bool = add_argument('--foo', action='store_true')
     bar: bool = add_argument('--bar', action='store_false')
     baz: bool = add_argument('--baz', action='store_false')
Example #16
0
 class Args(TypedArgs):
     # Single `-c/--config` option annotated Optional[str]; no default given.
     config: Optional[str] = add_argument('-c', '--config')
Example #17
0
 class Args(TypedArgs):
     # Single `-c/--config` option with default '/path'.
     config: str = add_argument('-c', '--config', default='/path')
Example #18
0
 class Args(ta.TypedArgs):
     # `foo` is reachable as `-f` or `--foo`; `bar` is declared without flag
     # names.
     foo: str = ta.add_argument('-f', '--foo')
     bar: str = ta.add_argument()
Example #19
0
 class Args1(TypedArgs):
     # Single `--foo` option annotated Optional[str]; no default given.
     foo: Optional[str] = add_argument('--foo')
Example #20
0
 class Args(TypedArgs):
     # Fixed-count list fields: `--foo` with nargs=2 and a flag-less `bar`
     # with nargs=1.
     foo: List[str] = add_argument('--foo', nargs=2)
     bar: List[str] = add_argument(nargs=1)
Example #21
0
 class Args2(TypedArgs):
     # `--foo` uses action='store_const' with the constant 42.
     foo: int = add_argument('--foo', action='store_const', const=42)
Example #22
0
class Args(ta.TypedArgs):
    # Argument spec for a launcher: rank/world-size and master address
    # settings, a handful of switches, then the training script and the
    # remainder of the command line.

    rank_start: int = ta.add_argument(
        '--rank-start',
        default=0,
        type=int,
    )
    world_size: int = ta.add_argument(
        '--world-size',
        default=1,
        type=int,
    )
    master_addr: str = ta.add_argument(
        '--master-addr',
        default='127.0.0.1',
        type=str,
    )
    master_port: int = ta.add_argument(
        '--master-port',
        default=29500,
        type=int,
    )
    redirect_stdout_and_stderr: bool = ta.add_argument(
        '--redirect-stdout-and-stderr',
        action='store_true',
    )
    # Defaults to CPU (empty list); XLA support may be added later.
    gpu: List[int] = ta.add_argument(
        "--gpu",
        help="指定gpu,`1,2,5-7 -> [1,2,5,6,7]`",
        default=[],
        type=parse_gpu_list,
    )
    debug: bool = ta.add_argument('-d', '--debug', action='store_true')
    no_python: bool = ta.add_argument('--no-python', action='store_true')
    module: bool = ta.add_argument('-m', '--module', action='store_true')
    # Declared without flag names.
    training_script: str = ta.add_argument(type=str)
    training_script_args: List[str] = ta.add_argument(nargs=REMAINDER)
Example #23
0
 class Args4(TypedArgs):
     # `--foo` uses action='append'; no default is given.
     foo: List[str] = add_argument('--foo', action='append')
Example #24
0
class Args1(Args):
    # Subclass of Args that declares a `--foo` option of its own.
    foo: str = add_argument('--foo')
Example #25
0
 class Args(TypedArgs):
     # Verbosity counter: `--verbose` / `-v` with action='count', starting
     # from 0.
     verbose: int = add_argument(
         '--verbose', '-v', default=0, action='count',
     )
Example #26
0
class Args(ta.TypedArgs):
    # `--foo` is annotated Optional[str] with no default; `bar` is declared
    # without flag names, with nargs='*' and default [1, 2, 3].
    foo: Optional[str] = ta.add_argument('--foo')
    bar: List[int] = ta.add_argument(nargs='*', default=[1, 2, 3])
Example #27
0
class Args(TypedArgs):
    # Command-line options for an experiment, plus helpers that allocate a
    # numbered run directory inside the experiment directory and record the
    # launching command.

    config: Optional[str] = add_argument('-c',
                                         '--config',
                                         help='path to config')
    ext_config: List[str] = add_argument(
        '-x',
        '--ext-config',
        nargs='*',
        default=[],
        help='Extra jsonnet config',
    )
    debug: bool = add_argument('-d',
                               '--debug',
                               action='store_true',
                               help='debug flag')
    # The const path (timestamp included) is computed once, when this class
    # body is executed.
    experiment_dir: Optional[Path] = add_argument('-e',
                                                  '--experiment-dir',
                                                  const=Path('temp') /
                                                  get_timestamp(),
                                                  nargs=argparse.OPTIONAL,
                                                  help='experiment dir')
    _run_dir: Optional[Path] = add_argument('--run-dir')

    def __repr__(self):
        """Pretty-print the instance attributes, leaving out ``parser``."""
        d = self.__dict__.copy()
        # Tolerate instances without a 'parser' entry instead of raising
        # KeyError from repr().
        d.pop('parser', None)
        return pformat(d)

    def save(self):
        """Write ``run_dir / 'run.sh'``, a script that re-runs this command.

        The script changes to the current working directory, exports
        ``CUDA_VISIBLE_DEVICES`` when it is set, and invokes the current
        interpreter with the shell-quoted ``sys.argv``.
        """
        with open(self.run_dir / 'run.sh', 'w') as f:
            f.write(f'cd {quote(os.getcwd())}\n')
            envs = ['CUDA_VISIBLE_DEVICES']
            for env in envs:
                value = os.environ.get(env)
                if value is not None:
                    f.write(f'export {env}={quote(value)}\n')
            # Quote the interpreter as well, so a path containing spaces or
            # shell metacharacters survives in the script.
            interpreter = quote(sys.executable or 'python')
            args_str = ' '.join(quote(arg) for arg in sys.argv)
            f.write(f'{interpreter} {args_str}\n')

    # Raw string: '\d' in a normal literal is an invalid escape sequence and
    # triggers a warning at compile time on current Python versions.
    RUN_DIR_NAME_REGEX = re.compile(r'^run_(\d+)_')

    @property
    def run_dir(self):
        """Return the run directory, choosing a new one on first access.

        When ``experiment_dir`` is set and no run directory has been chosen
        yet, the next free id is one more than the largest id among existing
        entries whose name matches ``RUN_DIR_NAME_REGEX`` (0 if there are
        none), and the path ``experiment_dir / 'run_<id>_<timestamp>'`` is
        stored in ``_run_dir``. The directory itself is not created here.
        """
        if self.experiment_dir is not None and self._run_dir is None:
            run_id = -1
            if self.experiment_dir.exists():
                for previous_run in self.experiment_dir.iterdir():
                    match = self.RUN_DIR_NAME_REGEX.match(previous_run.name)
                    if match is not None:
                        run_id = max(int(match.group(1)), run_id)

            run_id += 1
            self._run_dir = self.experiment_dir / f'run_{run_id}_{get_timestamp()}'
        return self._run_dir

    def make_run_dir(self):
        """Create the experiment directory (if needed) and the run directory.

        Does nothing when ``experiment_dir`` is ``None``.

        Raises:
            EnvironmentError: the run directory exists and the user declined
                to replace it.
        """
        if self.experiment_dir is not None:
            self.experiment_dir.mkdir(parents=True, exist_ok=True)
            if not self.ask_for_replacing_older_dir(self.run_dir):
                raise EnvironmentError(f'Run dir "{self.run_dir}" exists')
            self.run_dir.mkdir(parents=True, exist_ok=False)

    def make_experiment_dir(self):
        """Replace an existing experiment directory, then create the run dir.

        Raises:
            EnvironmentError: the experiment directory exists and the user
                declined to replace it.
        """
        if not self.ask_for_replacing_older_dir(self.experiment_dir):
            raise EnvironmentError(
                f'Experiment dir "{self.experiment_dir}" exists')
        # NOTE(review): this creates run_dir (and, via parents=True, the
        # experiment dir above it), not experiment_dir itself -- confirm that
        # this is intended.
        self.run_dir.mkdir(parents=True, exist_ok=False)

    def ask_for_replacing_older_dir(self, dir_to_be_replaced: Path) -> bool:
        """Ask on stdin whether an existing directory may be removed.

        Args:
            dir_to_be_replaced: Path that is about to be (re)created.

        Returns:
            True if the path does not exist, or if the user answered ``y`` /
            ``yes`` and the directory tree was removed; False otherwise.
        """
        if not dir_to_be_replaced.exists():
            return True

        print(
            f'File exists: {dir_to_be_replaced}\nDo you want to remove it and create a new one?'
        )
        choice = input('Remove older directory? [y]es/[n]o: ')

        if choice in ['y', 'yes']:
            shutil.rmtree(dir_to_be_replaced)
            return True
        return False
Example #28
0
class Args(ta.TypedArgs):
    # Single `--no-python` option annotated bool.
    # NOTE(review): no `action` is given here, so this may not behave as an
    # on/off switch -- confirm whether action='store_true' was intended.
    no_python: bool = ta.add_argument('--no-python')