Code example #1
    def _parts_id(hyper, parts, short=False, hashed=False):
        id_parts = []
        for key, value in parts.items():
            if value is None:
                continue
            clsname, params = value
            type_str = clsname.split('.')[-1]
            id_parts.append(type_str)

            # Precedence of specifications (from lowest to highest):
            # SF=single flag, EF=explicit flag
            # SF-short, SF-hash, EF-short, EF-hash
            # (a usage sketch follows this function)
            request_short = short is True
            request_hash = hashed is True
            if (ub.iterable(short) and key in short):
                request_hash = False
                request_short = True
            if (ub.iterable(hashed) and key in hashed):
                request_hash = True
                request_short = False

            if request_hash:
                param_str = util.make_idstr(params)
                param_str = _hash_data(param_str)[0:6]
            elif request_short:
                param_str = util.make_short_idstr(params)
            else:
                param_str = util.make_idstr(params)

            if param_str:
                id_parts.append(param_str)
        idstr = ','.join(id_parts)
        return idstr
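
    # For illustration, a hedged usage sketch of the precedence rules (the
    # parts mapping and the hyper object below are hypothetical, not taken
    # from the netharn sources):
    #
    #     parts = {
    #         'model': ('netharn.models.ToyNet2d', {}),
    #         'optimizer': ('torch.optim.SGD', {'lr': 0.001}),
    #     }
    #     # short=True is a single flag (SF-short) that applies to every
    #     # key, but the explicit flag hashed=['optimizer'] (EF-hash)
    #     # outranks it for that key: the optimizer params get hashed while
    #     # the model params use the short form.
    #     idstr = _parts_id(hyper, parts, short=True, hashed=['optimizer'])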
Code example #2
File: hyperparams.py Project: Erotemic/netharn
    def train_info(hyper, train_dpath=None):
        """
        Create json metadata with enough information for a human to
        reproduce the experiment.

        Example:
            >>> import netharn as nh
            >>> import ubelt as ub
            >>> datasets = {
            >>>     'train': nh.data.ToyData2d(size=3, border=1, n=256, rng=0),
            >>>     'vali': nh.data.ToyData2d(size=3, border=1, n=128, rng=1),
            >>> }
            >>> hyper = nh.hyperparams.HyperParams(**{
            >>>     # --- Data First
            >>>     'datasets'    : datasets,
            >>>     'name'        : 'demo',
            >>>     'workdir'     : ub.ensure_app_cache_dir('netharn/demo'),
            >>>     'loaders'     : {'batch_size': 64},
            >>>     'xpu'         : nh.XPU.coerce('auto'),
            >>>     # --- Algorithm Second
            >>>     'model'       : (nh.models.ToyNet2d, {}),
            >>>     'optimizer'   : (nh.optimizers.SGD, {
            >>>         'lr': 0.001
            >>>     }),
            >>>     'criterion'   : (nh.criterions.CrossEntropyLoss, {}),
            >>>     #'criterion'   : (nh.criterions.FocalLoss, {}),
            >>>     'initializer' : (nh.initializers.KaimingNormal, {
            >>>         'param': 0,
            >>>     }),
            >>>     'scheduler'   : (nh.schedulers.ListedLR, {
            >>>         'step_points': {0: .001, 2: .01, 5: .015, 6: .005, 9: .001},
            >>>         'interpolate': True,
            >>>     }),
            >>>     'monitor'     : (nh.Monitor, {
            >>>         'max_epoch': 10
            >>>     }),
            >>> })
            >>> info = hyper.train_info()
            >>> print(ub.repr2(info))
        """
        given_explicit_train_dpath = train_dpath is not None
        # TODO: needs MASSIVE cleanup and organization

        # TODO: if pretrained is another netharn model, then we should read that
        # train_info if it exists and append it to a running list of train_info

        if hyper.model_cls is None:
            # import utool
            # utool.embed()
            raise ValueError('model_cls is None')
        # arch = hyper.model_cls.__name__

        train_dset = hyper.datasets.get('train', None)
        if train_dset is not None and hasattr(train_dset, 'input_id'):
            input_id = train_dset.input_id
            if callable(input_id):
                input_id = input_id()
        else:
            warnings.warn(
                'FitHarn cannot track the training dataset state because '
                'harn.datasets["train"] is missing the "input_id" attribute.')
            input_id = 'none'

        def _hash_data(data):
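            # sha512 digest re-encoded in a 26-letter alphabet (base='abc');
            # the truncation NOTE below assumes this alphabet size (A = 26)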
            return ub.hash_data(data, hasher='sha512', base='abc', types=True)

        train_hyper_id_long = hyper.hyper_id()
        train_hyper_id_brief = hyper.hyper_id(short=False, hashed=True)
        train_hyper_hashid = _hash_data(train_hyper_id_long)[:8]

        # TODO: hash this to some degree
        other_id = hyper.other_id()

        augment_json = hyper.augment_json()

        aug_brief = 'AU' + _hash_data(augment_json)[0:6]
        # extra_hash = _hash_data([hyper.centering])[0:6]

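        # train_id layout: <input hash>_<brief hyper id>_<aug brief>_<other id>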
        train_id = '{}_{}_{}_{}'.format(
            _hash_data(input_id)[:6], train_hyper_id_brief, aug_brief,
            other_id)

        # Gather all information about this run into a single hash
        """
        NOTE:
            On choosing the length to truncate the hash.

            If we have an alphabet of size A=26, and we truncate to M=8
            samples, then the number of possible hash values is N = A ** M.
            The probability we will have a collision (assuming an ideal hash
            function where all outputs are equally likely) in r different
            inputs is given by the following function. Note this is the
            birthday paradox problem [1].


            ```python
            from numpy import exp, log  # (scipy no longer re-exports these)
            from scipy.special import gammaln
            def prob_unique(N, r):
                return exp( gammaln(N+1) - gammaln(N-r+1) - r*log(N) )

            A = 26  # size of the alphabet for _hash_data
            M = 8   # number of characters we truncate at
            N = A ** M  # number of possible hash values

            r = 1000

            prob_collision = 1 - prob_unique(N, r)
            print('prob_collision = {!r}'.format(prob_collision))
            ```

            Evaluated naively in double precision this prints roughly
            0.00056 (about 1 in 1784), but that figure is mostly rounding
            noise: the gammaln terms are on the order of 5e12, so their
            difference carries an absolute error near 1e-3, which swamps
            the true answer. A numerically stable evaluation (sketched
            below) gives about 2.4e-6 for r = 1000 and about 2.4e-4 for
            r = 10000, both comfortably low. I doubt we will ever run
            training in the same work directory (and with the same nice
            "name") 10,000 different times, so using an 8 character hash
            seems safe and user friendly for this purpose. Perhaps we may
            move to 12, 16, or 32+ in the future, but for pre-1.0 netharn,
            8 seems fine.
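
            The same quantity can be computed stably by staying in log
            space with log1p/expm1 (a minimal sketch, independent of this
            module):

            ```python
            import numpy as np

            def prob_collision(N, r):
                # log(P[all r hashes unique]) = sum_{k=1}^{r-1} log(1 - k/N)
                log_p_unique = np.log1p(-np.arange(1, r) / N).sum()
                # 1 - exp(x), evaluated stably as -expm1(x)
                return -np.expm1(log_p_unique)

            N = 26 ** 8
            print(prob_collision(N, 1000))   # ~2.39e-06  (1 in ~418,000)
            print(prob_collision(N, 10000))  # ~2.39e-04  (1 in ~4,200)
            ```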

            References:
                ..[1] https://www.johndcook.com/blog/2016/01/30/general-birthday-problem/

        """
        train_hashid = _hash_data(train_id)[0:8]

        name = hyper.name

        nice_dpath = None
        name_dpath = None
        if not given_explicit_train_dpath:
            # set up a canonical dir and a linked symlink dir
            train_dpath = normpath(
                join(hyper.workdir, 'fit', 'runs', name, train_hashid))
            # also set up a custom "name" symlink, which may conflict. This
            # will overwrite an existing "name" symlink, but the real runs
            # directory is based on a hash, so with astronomically high
            # probability it won't be clobbered.
            if name:
                try:
                    name_dpath = normpath(
                        join(hyper.workdir, 'fit', 'name', name))
                    nice_dpath = normpath(
                        join(hyper.workdir, 'fit', 'nice', name))
                except Exception:
                    print('hyper.workdir = {!r}'.format(hyper.workdir))
                    print('hyper.name = {!r}'.format(hyper.name))
                    raise
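
        # Illustrative layout under workdir (a hedged sketch; "name" and
        # "nice" are convenience symlinks, "runs" holds the canonical
        # hash-based directory):
        #   <workdir>/fit/runs/<name>/<train_hashid>/
        #   <workdir>/fit/name/<name>
        #   <workdir>/fit/nice/<name>   (deprecated alias of "name")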

        # make temporary initializer so we can infer the history
        temp_initializer = hyper.make_initializer()
        init_history = temp_initializer.history()

        train_info = ub.odict([
            ('train_hashid', train_hashid),
            ('train_id', train_id),
            ('workdir', hyper.workdir),
            ('aug_brief', aug_brief),
            ('input_id', input_id),
            ('other_id', other_id),
            ('hyper', hyper.get_initkw()),
            ('train_hyper_id_long', train_hyper_id_long),
            ('train_hyper_id_brief', train_hyper_id_brief),
            ('train_hyper_hashid', train_hyper_hashid),
            ('init_history', init_history),
            ('init_history_hashid', _hash_data(util.make_idstr(init_history))),
            ('name', hyper.name),
            ('nice', hyper.name),
            ('old_train_dpath',
             normpath(join(hyper.workdir, 'fit', 'runs', train_hashid))),
            ('train_dpath', train_dpath),
            # ('link_dpath', link_dpath),

            # "nice" will be deprecated for "name_dpath"
            ('nice_dpath', nice_dpath),
            ('name_dpath', name_dpath),
            ('given_explicit_train_dpath', given_explicit_train_dpath),

            # TODO, add in classes if applicable
            # TODO, add in centering if applicable
            # ('centering', hyper.centering),
            ('other', hyper.other),

            # HACKED IN
            ('augment', hyper.augment_json()),
            ('extra', hyper.extra),
            ('argv', sys.argv),
            ('hostname', platform.node()),
        ])
        return train_info
Code example #3
    def train_info(self, short=True, hashed=True):
        """
        CommandLine:
            python ~/code/netharn/netharn/folders.py Folders.train_info

        Example:
            >>> import netharn as nh
            >>> import ubelt as ub
            >>> datasets = {
            >>>     'train': nh.data.ToyData2d(size=3, border=1, n=256, rng=0),
            >>>     'vali': nh.data.ToyData2d(size=3, border=1, n=128, rng=1),
            >>> }
            >>> hyper = nh.hyperparams.HyperParams(**{
            >>>     # --- Data First
            >>>     'datasets'    : datasets,
            >>>     'nice'        : 'demo',
            >>>     'workdir'     : ub.ensure_app_cache_dir('netharn/demo'),
            >>>     'loaders'     : {'batch_size': 64},
            >>>     'xpu'         : nh.XPU.cast('auto'),
            >>>     # --- Algorithm Second
            >>>     'model'       : (nh.models.ToyNet2d, {}),
            >>>     'optimizer'   : (nh.optimizers.SGD, {
            >>>         'lr': 0.001
            >>>     }),
            >>>     'criterion'   : (nh.criterions.CrossEntropyLoss, {}),
            >>>     #'criterion'   : (nh.criterions.FocalLoss, {}),
            >>>     'initializer' : (nh.initializers.KaimingNormal, {
            >>>         'param': 0,
            >>>     }),
            >>>     'scheduler'   : (nh.schedulers.ListedLR, {
            >>>         'step_points': {0: .001, 2: .01, 5: .015, 6: .005, 9: .001},
            >>>         'interpolate': True,
            >>>     }),
            >>>     'monitor'     : (nh.Monitor, {
            >>>         'max_epoch': 10
            >>>     }),
            >>> })
            >>> folders = Folders(hyper)
            >>> info = folders.train_info()
            >>> print(ub.repr2(info))
        """
        # TODO: needs MASSIVE cleanup and organization

        # TODO: if pretrained is another netharn model, then we should read that
        # train_info if it exists and append it to a running list of train_info
        hyper = self.hyper

        if hyper.model_cls is None:
            # import utool
            # utool.embed()
            raise ValueError('model_cls is None')
        # arch = hyper.model_cls.__name__

        train_dset = hyper.datasets['train']
        if hasattr(train_dset, 'input_id'):
            input_id = train_dset.input_id
            if callable(input_id):
                input_id = input_id()
        else:
            input_id = 'none'

        train_hyper_id_long = hyper.hyper_id()
        train_hyper_id_brief = hyper.hyper_id(short=short, hashed=hashed)
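        # NOTE: short/hashed may be booleans (single flags) or iterables of
        # part keys (explicit flags); see _parts_id in code example #1 for
        # the precedence rules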
        train_hyper_hashid = ub.hash_data(train_hyper_id_long)[:8]

        # TODO: hash this to some degree
        other_id = hyper.other_id()

        augment_json = hyper.augment_json()

        aug_brief = 'AU' + ub.hash_data(augment_json)[0:6]
        # extra_hash = ub.hash_data([hyper.centering])[0:6]

        train_id = '{}_{}_{}_{}'.format(
            ub.hash_data(input_id)[:6], train_hyper_id_brief,
            aug_brief, other_id)

        # Gather all information about this run into a single hash
        train_hashid = ub.hash_data(train_id)[0:8]

        # input_dname = 'input_' + input_id
        # verbose_dpath = join(self.hyper.workdir, 'fit', 'link', 'arch', arch, input_dname, train_id)
        hashed_dpath = join(self.hyper.workdir, 'fit', 'runs', train_hashid)

        # set up a canonical and a linked symlink dir
        train_dpath = hashed_dpath
        # link_dpath = verbose_dpath

        # also setup a "nice" custom name, which may conflict, but oh well
        if hyper.nice:
            nice_dpath = join(self.hyper.workdir, 'fit', 'nice', hyper.nice)
        else:
            nice_dpath = None

        # make temporary initializer so we can infer the history
        temp_initializer = hyper.make_initializer()
        init_history = temp_initializer.history()

        train_info = ub.odict([
            ('train_hashid', train_hashid),
            ('train_id', train_id),
            ('workdir', self.hyper.workdir),
            ('aug_brief', aug_brief),
            ('input_id', input_id),
            ('other_id', other_id),
            ('hyper', hyper.get_initkw()),
            ('train_hyper_id_long', train_hyper_id_long),
            ('train_hyper_id_brief', train_hyper_id_brief),
            ('train_hyper_hashid', train_hyper_hashid),
            ('init_history', init_history),
            ('init_history_hashid', ub.hash_data(util.make_idstr(init_history))),
            ('nice', hyper.nice),
            ('train_dpath', train_dpath),
            # ('link_dpath', link_dpath),
            ('nice_dpath', nice_dpath),
            # TODO, add in n_classes if applicable
            # TODO, add in centering if applicable
            # ('centering', hyper.centering),
            # HACKED IN
            ('augment', hyper.augment_json()),
        ])
        return train_info
Code example #4
    def train_info(self, train_dpath=None, short=True, hashed=True):
        """
        TODO: maybe this doesn't belong in folders?

        CommandLine:
            python ~/code/netharn/netharn/folders.py Folders.train_info

        Example:
            >>> import netharn as nh
            >>> import ubelt as ub
            >>> datasets = {
            >>>     'train': nh.data.ToyData2d(size=3, border=1, n=256, rng=0),
            >>>     'vali': nh.data.ToyData2d(size=3, border=1, n=128, rng=1),
            >>> }
            >>> hyper = nh.hyperparams.HyperParams(**{
            >>>     # --- Data First
            >>>     'datasets'    : datasets,
            >>>     'nice'        : 'demo',
            >>>     'workdir'     : ub.ensure_app_cache_dir('netharn/demo'),
            >>>     'loaders'     : {'batch_size': 64},
            >>>     'xpu'         : nh.XPU.cast('auto'),
            >>>     # --- Algorithm Second
            >>>     'model'       : (nh.models.ToyNet2d, {}),
            >>>     'optimizer'   : (nh.optimizers.SGD, {
            >>>         'lr': 0.001
            >>>     }),
            >>>     'criterion'   : (nh.criterions.CrossEntropyLoss, {}),
            >>>     #'criterion'   : (nh.criterions.FocalLoss, {}),
            >>>     'initializer' : (nh.initializers.KaimingNormal, {
            >>>         'param': 0,
            >>>     }),
            >>>     'scheduler'   : (nh.schedulers.ListedLR, {
            >>>         'step_points': {0: .001, 2: .01, 5: .015, 6: .005, 9: .001},
            >>>         'interpolate': True,
            >>>     }),
            >>>     'monitor'     : (nh.Monitor, {
            >>>         'max_epoch': 10
            >>>     }),
            >>> })
            >>> folders = Folders(hyper)
            >>> info = folders.train_info()
            >>> print(ub.repr2(info))
        """
        given_explicit_train_dpath = train_dpath is not None
        # TODO: needs MASSIVE cleanup and organization

        # TODO: if pretrained is another netharn model, then we should read that
        # train_info if it exists and append it to a running list of train_info
        hyper = self.hyper

        if hyper.model_cls is None:
            # import utool
            # utool.embed()
            raise ValueError('model_cls is None')
        # arch = hyper.model_cls.__name__

        train_dset = hyper.datasets.get('train', None)
        if train_dset is not None and hasattr(train_dset, 'input_id'):
            input_id = train_dset.input_id
            if callable(input_id):
                input_id = input_id()
        else:
            warnings.warn(
                'FitHarn cannot track the training dataset state because '
                'harn.datasets["train"] is missing the "input_id" attribute.')
            input_id = 'none'

        def _hash_data(data):
            return ub.hash_data(data, hasher='sha512', base='abc', types=True)

        train_hyper_id_long = hyper.hyper_id()
        train_hyper_id_brief = hyper.hyper_id(short=short, hashed=hashed)
        train_hyper_hashid = _hash_data(train_hyper_id_long)[:8]

        # TODO: hash this to some degree
        other_id = hyper.other_id()

        augment_json = hyper.augment_json()

        aug_brief = 'AU' + _hash_data(augment_json)[0:6]
        # extra_hash = _hash_data([hyper.centering])[0:6]

        train_id = '{}_{}_{}_{}'.format(
            _hash_data(input_id)[:6], train_hyper_id_brief, aug_brief,
            other_id)

        # Gather all information about this run into a single hash
        train_hashid = _hash_data(train_id)[0:8]

        nice = hyper.nice

        nice_dpath = None
        if not given_explicit_train_dpath:
            # set up a canonical and a linked symlink dir
            train_dpath = normpath(
                join(self.hyper.workdir, 'fit', 'runs', nice, train_hashid))
            # also setup a "nice" custom name, which may conflict, but oh well
            if nice:
                try:
                    nice_dpath = normpath(
                        join(self.hyper.workdir, 'fit', 'nice', nice))
                except Exception:
                    print('self.hyper.workdir = {!r}'.format(
                        self.hyper.workdir))
                    print('hyper.nice = {!r}'.format(hyper.nice))
                    raise

        # make temporary initializer so we can infer the history
        temp_initializer = hyper.make_initializer()
        init_history = temp_initializer.history()

        train_info = ub.odict([
            ('train_hashid', train_hashid),
            ('train_id', train_id),
            ('workdir', self.hyper.workdir),
            ('aug_brief', aug_brief),
            ('input_id', input_id),
            ('other_id', other_id),
            ('hyper', hyper.get_initkw()),
            ('train_hyper_id_long', train_hyper_id_long),
            ('train_hyper_id_brief', train_hyper_id_brief),
            ('train_hyper_hashid', train_hyper_hashid),
            ('init_history', init_history),
            ('init_history_hashid', _hash_data(util.make_idstr(init_history))),
            ('nice', hyper.nice),
            ('old_train_dpath',
             normpath(join(self.hyper.workdir, 'fit', 'runs', train_hashid))),
            ('train_dpath', train_dpath),
            # ('link_dpath', link_dpath),
            ('nice_dpath', nice_dpath),
            ('given_explicit_train_dpath', given_explicit_train_dpath),

            # TODO, add in n_classes if applicable
            # TODO, add in centering if applicable
            # ('centering', hyper.centering),
            ('other', hyper.other),

            # HACKED IN
            ('augment', hyper.augment_json()),
            ('extra', hyper.extra),
            ('argv', sys.argv),
            ('hostname', platform.node()),
        ])
        return train_info