def load_ensemble_weights(self, norm_sample, task_name, ens):
    # -- load the weights from the most recent ensemble, if there is one.
    for trial in self.history[-1:]:
        info('Loading weights from document %i' % trial['tid'])
        trial_norm_key = self.norm_key(norm_sample, tid=trial['tid'])
        trial_weights = trial['result']['weights']
        norm_task_weights = trial_weights[trial_norm_key][task_name]
        for norm_key, weight in norm_task_weights.items():
            if ens.has_member(norm_key):
                ens.set_weight(norm_key, weight)
            else:
                ens.add_member(norm_key, weight)
            info(' .. weight[%s] = %s' % (norm_key, weight))
            foobar.append_trace('load ensemble weights', norm_key, weight)
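
A note on the loop above: the has_member / set_weight / add_member calls amount to an upsert into the ensemble's weight table. A minimal dict-backed stand-in (hypothetical, not the real ensemble API) shows the pattern in isolation:

class DictEnsemble(object):
    """Hypothetical stand-in for `ens`, backed by a plain dict."""
    def __init__(self):
        self._weights = {}

    def has_member(self, norm_key):
        return norm_key in self._weights

    def set_weight(self, norm_key, weight):
        self._weights[norm_key] = weight

    def add_member(self, norm_key, weight):
        # a real ensemble would also register the member's features here
        self._weights[norm_key] = weight

ens = DictEnsemble()
for norm_key, weight in {'norm_a': 0.5, 'norm_b': 1.25}.items():
    if ens.has_member(norm_key):
        ens.set_weight(norm_key, weight)
    else:
        ens.add_member(norm_key, weight)
assert ens._weights == {'norm_a': 0.5, 'norm_b': 1.25}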
Example #2
def unsup_images(data_view, trn, N):
    """
    Return a block of 
    """
    if trn == 'DevTrain':
        # -- extract training images, and put them into channel-major format
        imgs = larray.reindex(data_view.image_pixels,
                data_view.dev_train['lpathidx'][0, :N])[:]
        imgs = np.asarray(imgs)
        assert 'int' in str(imgs.dtype)
        foobar.append_ndarray_signature(imgs, 'unsup_images')
        foobar.append_trace('unsup_images N', N)
        return imgs.transpose(0, 3, 1, 2).copy()
    else:
        raise NotImplementedError()
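
The final transpose converts the loader's channel-minor layout (N, rows, cols, channels) into channel-major layout (N, channels, rows, cols). A quick self-contained illustration:

import numpy as np

imgs = np.zeros((4, 32, 32, 3), dtype='uint8')   # (N, rows, cols, channels)
channel_major = imgs.transpose(0, 3, 1, 2).copy()
assert channel_major.shape == (4, 3, 32, 32)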
Example #3
    def load_ensemble_history(self, fields):

        trials = self.ctrl.trials
        if hasattr(trials, 'handle'):
            # query MongoDB directly to avoid transferring unnecessary fields
            docs_for_bh = BoostHelper.query_MongoTrials(
                trials,
                fields=fields)
            # download only those docs that are in the active history
            trials.refresh_tids([d['tid'] for d in docs_for_bh])
            # -- XXX: relatively arbitrary assert to make sure we didn't
            # download a whole whack of documents... the point of
            # refresh_tids is to avoid this.
            assert len(trials.trials) < len(docs_for_bh) + 5, (
                len(trials.trials), len(docs_for_bh))
        else:
            trials.refresh()
            docs_for_bh = trials.trials

        def helper():
            bh = BoostHelper(docs_for_bh)

            if self.ctrl.current_trial is None:
                history = []
            else:
                history = bh.history(self.ctrl.current_trial)
                assert history[-1] is self.ctrl.current_trial
                history.pop(-1)
            info('load_ensemble_history: %i previous model documents found'
                    % len(history))
            return history

        retries = 20
        while retries:
            history = helper()
            if any(trial['result'].get('in_progress') for trial in history):
                warn('Previous trial is still in progress, waiting 30s')
                time.sleep(30)
                retries -= 1
            else:
                break

        foobar.append_trace('load ensemble history len', len(history))

        if retries:
            self.history = history
        else:
            raise Exception('Previous trial in progress, cannot continue')
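
The retry loop is a generic poll-and-wait pattern: re-run the query, and only proceed once no previous trial reports itself in progress. The same logic, factored into a hypothetical standalone helper:

import time

def wait_until(done, retries=20, delay=30.0):
    """Poll done() until it returns a truthy value, or give up."""
    while retries:
        result = done()
        if result:
            return result
        retries -= 1
        time.sleep(delay)
    raise Exception('condition still unmet after polling')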
Example #4
def random_patches(images, N, R, C, rng, channel_major=False, memlimit=None):
    """Return a stack of N image patches (channel major version)"""

    def N_with_memlimit():
        if memlimit is not None:
            # -- memlimit in bytes
            sizelimit = memlimit // images.dtype.itemsize
            return min(N, sizelimit // (R * C * iF))
        else:
            return N

    if channel_major:
        n_imgs, iF, iR, iC = images.shape
        N = N_with_memlimit()
        rval = np.empty((N, iF, R, C), dtype=images.dtype)
    else:
        n_imgs, iR, iC, iF = images.shape
        N = N_with_memlimit()
        rval = np.empty((N, R, C, iF), dtype=images.dtype)

    foobar.append_trace('random_patches dims', *rval.shape)
    foobar.append_randomstate('random_patches rng', rng)

    srcs = rng.randint(n_imgs, size=N)

    if R > iR or C > iC:
        raise InvalidDescription('cannot extract patches', (R, C))
    roffsets = rng.randint(iR - R + 1, size=N)
    coffsets = rng.randint(iC - C + 1, size=N)
    # TODO: this can be done with one advanced index right?
    for rv_i, src_i, ro, co in zip(rval, srcs, roffsets, coffsets):
        if channel_major:
            rv_i[:] = images[src_i, :, ro:ro + R, co:co + C]
        else:
            rv_i[:] = images[src_i, ro:ro + R, co:co + C]
    foobar.append_ndarray_signature(rval, 'random_patches rval')
    return rval
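
The TODO above asks whether the copy loop can be replaced with a single advanced index. It can: broadcasting each patch's row and column offsets against index grids selects every patch in one indexing expression. A minimal sketch for the channel-minor case (the function name is illustrative):

import numpy as np

def random_patches_vectorized(images, N, R, C, rng):
    n_imgs, iR, iC, iF = images.shape
    srcs = rng.randint(n_imgs, size=N)[:, None, None]            # (N, 1, 1)
    rows = (rng.randint(iR - R + 1, size=N)[:, None, None]
            + np.arange(R)[:, None])                             # (N, R, 1)
    cols = (rng.randint(iC - C + 1, size=N)[:, None, None]
            + np.arange(C))                                      # (N, 1, C)
    # the three index arrays broadcast to (N, R, C); the trailing
    # channel axis is carried through untouched -> (N, R, C, iF)
    return images[srcs, rows, cols]

rng = np.random.RandomState(0)
imgs = rng.randint(0, 255, size=(10, 32, 32, 3)).astype('uint8')
assert random_patches_vectorized(imgs, 5, 8, 8, rng).shape == (5, 8, 8, 3)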
Example #5
        def train_main():
            ens.train_sample = task.name

            t0 = time.time()
            if valid is None:
                svm_dct['l2_reg'] = pipeline['l2_reg']
                ens.fit_svm(svm_dct['l2_reg'])
                svm_dct['train_error'] = ens.error_rate(task.name)
                svm_dct['loss'] = svm_dct['train_error']
            else:

                scales = {m: 3.0 for m in ens._weights}
                scales[norm_key] = 100.0

                info('fit_weights_crossvalid(%s, %i)' % (
                    valid.name, self.svm_crossvalid_max_evals))
                ens.fit_weights_crossvalid(valid.name,
                        max_evals=self.svm_crossvalid_max_evals,
                        scales=scales)

                foobar.append_trace('xvalid weights', sorted(ens._weights.items()))

                svm_dct['task_error'] = ens.error_rate(task.name)
                foobar.append_trace('task_error', svm_dct['task_error'])

                svm_dct['valid_name'] = valid.name
                svm_dct['valid_error'] = ens.error_rate(valid.name)
                info('valid_error %f' % svm_dct['valid_error'])
                foobar.append_trace('valid_error', svm_dct['valid_error'])

                svm_dct['l2_reg'] = None  # -- use default when retraining

                # -- re-fit the model using best weights on train + valid sets
                ens.train_sample = train_valid
                ens.fit_svm()

            fit_time = time.time() - t0
            svm_dct['fit_time'] = fit_time
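
train_main follows a tune-then-refit protocol: with no validation sample it fits once on the training data and reports training error; with one, it cross-validates the ensemble weights against the validation sample, records the errors, and then refits on the combined train+valid sample. A stripped-down sketch of that control flow with a stub model (all names here are hypothetical):

import time

class StubEnsemble(object):
    """Hypothetical stand-in with just enough API for the sketch."""
    train_sample = None

    def fit_svm(self, l2_reg=None):
        pass  # placeholder: train a classifier on self.train_sample

    def error_rate(self, sample_name):
        return 0.0  # placeholder

def fit_and_time(ens, train_name, valid_name=None, train_valid_name=None):
    t0 = time.time()
    svm_dct = {}
    ens.train_sample = train_name
    if valid_name is None:
        ens.fit_svm()
        svm_dct['loss'] = ens.error_rate(train_name)
    else:
        svm_dct['valid_error'] = ens.error_rate(valid_name)
        ens.train_sample = train_valid_name   # refit on train + valid
        ens.fit_svm()
    svm_dct['fit_time'] = time.time() - t0
    return svm_dct

print(fit_and_time(StubEnsemble(), 'train', 'valid', 'train+valid'))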
Example #6
    def normalized_image_match_features(self,
                                        task,
                                        svm_dct,
                                        role,
                                        batched_lmap_speed_thresh=None):
        assert role in ('train', 'test')
        if batched_lmap_speed_thresh is None:
            batched_lmap_speed_thresh = self.batched_lmap_speed_thresh
        image_features, cdict = self.get_image_features(
            task, batched_lmap_speed_thresh=batched_lmap_speed_thresh)
        del cdict  # -- no longer used (waste of memory)
        pipeline = self.pipeline
        info('Indexing into image_features of shape %s' %
             str(image_features.shape))

        comps = [getattr(comparisons, cc) for cc in self.comparison_names]
        n_features = np.prod(image_features.shape[1:])
        n_trn = len(task.lidx)

        x_trn_shp = (n_trn, len(comps), n_features)
        info('Allocating training ndarray of shape %s' % str(x_trn_shp))
        x_trn = np.empty(x_trn_shp, dtype='float32')

        # -- pre-compute all of the image_features we will need
        all_l_features = reindex(image_features, task.lidx)[:]
        all_r_features = reindex(image_features, task.ridx)[:]

        all_l_features = all_l_features.reshape(len(all_l_features), -1)
        all_r_features = all_r_features.reshape(len(all_r_features), -1)

        foobar.append_ndarray_signature(all_l_features,
                                        'normalized_image_match l_features',
                                        task.name)
        foobar.append_ndarray_signature(all_r_features,
                                        'normalized_image_match r_features',
                                        task.name)

        if role == 'train':
            if np.allclose(all_l_features.var(axis=0), 0.0):
                raise ValueError('Homogeneous features (non-finite features)')

            xmean_l, xstd_l = mean_and_std(all_l_features,
                                           remove_std0=pipeline['remove_std0'])
            xmean_r, xstd_r = mean_and_std(all_r_features,
                                           remove_std0=pipeline['remove_std0'])
            xmean = (xmean_l + xmean_r) / 2.0
            # -- this is an ad-hoc way of blending the variances.
            xstd = np.sqrt(
                np.maximum(xstd_l, xstd_r)**2 + pipeline['varthresh'])

            foobar.append_ndarray_signature(xmean,
                                            'normalized_image_match xmean',
                                            task.name)
            foobar.append_ndarray_signature(xstd,
                                            'normalized_image_match xstd',
                                            task.name)

            svm_dct['xmean'] = xmean
            svm_dct['xstd'] = xstd
        else:
            xmean = svm_dct['xmean']
            xstd = svm_dct['xstd']

        info('Computing comparison features')

        # -- now compute the "comparison functions" into x_trn
        for jj, (lfeat, rfeat) in enumerate(zip(all_l_features,
                                                all_r_features)):
            lfeat_z = (lfeat - xmean) / xstd
            rfeat_z = (rfeat - xmean) / xstd
            for ci, comp in enumerate(comps):
                x_trn[jj, ci, :] = comp(lfeat_z, rfeat_z)

        if pipeline['divrowl2']:
            info('Dividing by feature norms')
            # -- now normalize by average feature norm because some
            #    comparison functions come out smaller than others
            if role == 'train':
                svm_dct['divrowl2_avg_nrm'] = {}
                for ci, cname in enumerate(self.comparison_names):
                    avg_nrm = average_row_l2norm(x_trn[:, ci, :]) + 1e-7
                    svm_dct['divrowl2_avg_nrm'][cname] = avg_nrm

            avg_nrm_vec = [
                svm_dct['divrowl2_avg_nrm'][cname]
                for cname in self.comparison_names
            ]
            x_trn /= np.asarray(avg_nrm_vec)[None, :, None]
            foobar.append_trace('get_normalized_features avg_nrm', avg_nrm_vec)

        # -- collapse comparison and feature dimensions
        x_trn.shape = (x_trn.shape[0], x_trn.shape[1] * x_trn.shape[2])

        foobar.append_ndarray_signature(x_trn, 'normalized_image_match x_trn',
                                        task.name)
        info('normalized_image_match_features complete')
        return x_trn
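
The heart of the method is a three-step recipe: z-score the left and right feature vectors of each pair with shared statistics, apply every comparison function to the normalized pair, and flatten the per-comparison blocks into one feature vector per pair. A self-contained sketch with illustrative comparison functions (the real ones live in the comparisons module):

import numpy as np

def match_features(l_feats, r_feats, xmean, xstd, comps):
    lz = (l_feats - xmean) / xstd
    rz = (r_feats - xmean) / xstd
    # one (n_pairs, n_features) block per comparison, stacked then flattened
    out = np.stack([comp(lz, rz) for comp in comps], axis=1)
    return out.reshape(len(out), -1).astype('float32')

comps = [lambda a, b: np.abs(a - b), lambda a, b: a * b]
l = np.random.rand(6, 10)
r = np.random.rand(6, 10)
x = match_features(l, r, l.mean(axis=0), l.std(axis=0) + 1e-7, comps)
assert x.shape == (6, 20)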