Example #1
    def deploy(self, task_key=None, publish=False):
        """
        Trains and saves a classifier for deployment

        Notes:
            A deployment consists of the following information
                * The classifier itself
                * Information needed to construct the input to the classifier
                    - TODO: can this be encoded as an sklearn pipeline?
                * Metadata concerning what data the classifier was trained with
                * PUBLISH TO /media/hdd/PUBLIC/models/pairclf

        Example:
            >>> # xdoctest: +REQUIRES(module:wbia_cnn)
            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> params = dict(sample_method='random')
            >>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
            >>> pblm.setup(with_simple=False)
            >>> task_key = pblm.primary_task_key
            >>> self = Deployer(dpath='.', pblm=pblm)
            >>> deploy_info = self.deploy()

        Ignore:
            pblm.evaluate_classifiers(with_simple=False)
            res = pblm.task_combo_res[pblm.primary_task_key]['RF']['learn(sum,glob)']
        """
        deploy_info = self._make_deploy_info(task_key=task_key)
        deploy_fname = deploy_info['metadata']['fname']

        meta_fname = deploy_fname + self.meta_suffix
        deploy_fpath = join(self.dpath, deploy_fname)
        meta_fpath = join(self.dpath, meta_fname)

        ut.save_json(meta_fpath, deploy_info['metadata'])
        ut.save_data(deploy_fpath, deploy_info)

        if publish:
            user = ut.get_user_name()
            remote_uri = '{user}@{remote}:{path}'.format(user=user,
                                                         **self.publish_info)

            ut.rsync(meta_fpath, remote_uri + '/' + meta_fname)
            ut.rsync(deploy_fpath, remote_uri + '/' + deploy_fname)
        return deploy_info
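A minimal sketch of reading a deployment back in, continuing the doctest above. This assumes `ut` is utool (as elsewhere in these examples) and that ut.load_data mirrors the ut.save_data call in deploy(); the round-trip check below is illustrative, not part of the original API.

import utool as ut
from os.path import join

# `self` is the Deployer constructed in the docstring example above
deploy_info = self.deploy()
fname = deploy_info['metadata']['fname']
restored = ut.load_data(join(self.dpath, fname))
assert restored['metadata']['fname'] == fname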
Example #2
def add_split(dataset, key, idxs):
    print('[dataset] adding split %r' % (key,))
    # Build subset filenames
    ut.ensuredir(dataset.split_dpath)
    ext = dataset._ext
    fmtdict = dict(key=key, ext=ext, size=len(idxs))
    fmtstr = dataset.get_split_fmtstr(forward=True)
    splitset = {
        type_: join(dataset.split_dpath, fmtstr.format(type_=type_, **fmtdict))
        for type_ in ['data', 'labels', 'metadata']
    }
    # Partition data into the subset
    part_dict = {
        'data': dataset.data.take(idxs, axis=0),
        'labels': dataset.labels.take(idxs, axis=0),
    }
    if dataset.metadata is not None:
        taker = ut.partial(ut.take, index_list=idxs)
        part_dict['metadata'] = ut.map_dict_vals(taker, dataset.metadata)
    # Write splitset data to files
    for type_ in part_dict.keys():
        ut.save_data(splitset[type_], part_dict[type_])
    # Register filenames with dataset
    dataset.fpath_dict[key] = splitset
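A toy illustration of the index-based partitioning that add_split relies on: a split is just a row subset selected with numpy's take. The names and values here are hypothetical.

import numpy as np

data = np.arange(10).reshape(5, 2)            # 5 samples, 2 features
labels = np.array([0, 0, 1, 1, 1])
train_idxs = [0, 1, 3]
part_data = data.take(train_idxs, axis=0)     # rows 0, 1 and 3
part_labels = labels.take(train_idxs, axis=0)
assert part_data.shape == (3, 2)
assert list(part_labels) == [0, 0, 1]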
Example #3
def save(dataset, data, labels, metadata=None, data_per_label=1):
    ut.save_data(dataset.data_fpath, data)
    ut.save_data(dataset.labels_fpath, labels)
    if metadata is not None:
        ut.save_data(dataset.metadata_fpath, metadata)
    else:
        dataset.fpath_dict['full']['metadata'] = None
    # Cache the data because it is likely going to be used to define a
    # splitset
    dataset.subset_data.cache['full'] = data
    dataset.subset_labels.cache['full'] = labels
    dataset.subset_metadata.cache['full'] = metadata
    # Infer the rest of the required data info
    dataset._info['num_labels'] = len(labels)
    try:
        dataset._info['unique_labels'] = np.unique(labels)
    except ValueError:
        dataset._info['unique_labels'] = np.nan
    dataset._info['data_per_label'] = data_per_label
    ut.save_data(dataset.info_fpath, dataset._info)
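The np.unique fallback above guards against label containers that numpy cannot sort into a unique array. A small sketch of the inference step on a well-formed label array (names are hypothetical):

import numpy as np

labels = np.array([0, 1, 1, 2])
try:
    unique_labels = np.unique(labels)
except ValueError:
    # ragged or otherwise unsortable labels fall back to a sentinel
    unique_labels = np.nan
assert list(unique_labels) == [0, 1, 2]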
Example #4
def load_oxford_2013():
    """
    Loads data found in the README of the SMK publication
    https://hal.inria.fr/hal-00864684/document
    http://people.rennes.inria.fr/Herve.Jegou/publications.html
    which also provides the download script below.

    CommandLine:
        # Download oxford13 data
        cd ~/work/Oxford
        mkdir -p smk_data_iccv_2013
        cd smk_data_iccv_2013
        wget -nH --cut-dirs=4 -r -Pdata/ ftp://ftp.irisa.fr/local/texmex/corpus/iccv2013/

    This dataset has 5063 images, whereas the 2007 version has 5062.
    This dataset seems to contain an extra junk image:
        ashmolean_000214

    # Remember that MATLAB is 1-indexed!
    # DON'T FORGET TO CONVERT TO 0-INDEXING!
    """
    from yael.ynumpy import fvecs_read
    from yael.yutils import load_ext
    import scipy.io
    import vtool as vt
    from os.path import join

    dbdir = ut.truepath('/raid/work/Oxford/')
    datadir = dbdir + '/smk_data_iccv_2013/data/'

    # we are not retraining, so this is unused
    # # Training data descriptors for Paris6k dataset
    # train_sift_fname = join(datadir, 'paris_sift.uint8')  # NOQA
    # # File storing visual words of Paris6k descriptors used in our ICCV paper
    # train_vw_fname = join(datadir, 'clust_preprocessed/oxford_train_vw.int32')

    # Pre-learned quantizer used in ICCV paper (used if docluster=false)
    codebook_fname = join(datadir, 'clust_preprocessed/oxford_codebook.fvecs')

    # Files storing descriptors/geometry for Oxford5k dataset
    test_sift_fname = join(datadir, 'oxford_sift.uint8')
    test_geom_fname = join(datadir, 'oxford_geom_sift.float')
    test_nf_fname = join(datadir, 'oxford_nsift.uint32')

    # File storing visual words of Oxford5k descriptors used in our ICCV paper
    test_vw_fname = join(datadir, 'clust_preprocessed/oxford_vw.int32')
    # Ground-truth for Oxford dataset
    gnd_fname = join(datadir, 'gnd_oxford.mat')

    oxford_vecs = load_ext(test_sift_fname, ndims=128, verbose=True)
    oxford_nfeats = load_ext(test_nf_fname, verbose=True)
    oxford_words = fvecs_read(codebook_fname)
    oxford_wids = load_ext(test_vw_fname, verbose=True) - 1

    test_geom_invV_fname = test_geom_fname + '.invV.pkl'
    try:
        all_kpts = ut.load_data(test_geom_invV_fname)
        logger.info('loaded invV keypoints')
    except IOError:
        oxford_kptsZ = load_ext(test_geom_fname, ndims=5, verbose=True)
        logger.info('converting to invV keypoints')
        all_kpts = vt.convert_kptsZ_to_kpts(oxford_kptsZ)
        ut.save_data(test_geom_invV_fname, all_kpts)

    gnd_ox = scipy.io.loadmat(gnd_fname)
    imlist = [x[0][0] for x in gnd_ox['imlist']]
    qx_to_dx = gnd_ox['qidx'] - 1

    data_uri_order = imlist
    query_uri_order = ut.take(data_uri_order, qx_to_dx)

    offset_list = np.hstack(([0], oxford_nfeats.cumsum())).astype(np.int64)

    # query_gnd = gnd_ox['gnd'][0][0]
    # bboxes = query_gnd[0]
    # qx_to_ok_gtidxs1 = [x[0] for x in query_gnd[1][0]]
    # qx_to_junk_gtidxs2 = [x[0] for x in query_gnd[2][0]]
    # # ut.depth_profile(qx_to_gtidxs1)
    # # ut.depth_profile(qx_to_gtidxs2)

    assert sum(oxford_nfeats) == len(oxford_vecs)
    assert offset_list[-1] == len(oxford_vecs)
    assert len(oxford_wids) == len(oxford_vecs)
    assert oxford_wids.max() == len(oxford_words) - 1

    data = {
        'offset_list': offset_list,
        'all_kpts': all_kpts,
        'all_vecs': oxford_vecs,
        'words': oxford_words,
        'idx_to_wx': oxford_wids,
        'data_uri_order': data_uri_order,
        'query_uri_order': query_uri_order,
    }
    return data
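The offset_list built above makes the stacked descriptor array sliceable per image: descriptors of image i occupy the half-open range [offset_list[i], offset_list[i + 1]). A sketch (the helper name is hypothetical):

def vecs_for_image(data, i):
    # data is the dict returned by load_oxford_2013()
    lx, rx = data['offset_list'][i], data['offset_list'][i + 1]
    return data['all_vecs'][lx:rx]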
Example #5
def load_oxford_2007():
    """
    Loads data from
    http://www.robots.ox.ac.uk:5000/~vgg/publications/2007/Philbin07/philbin07.pdf

    >>> from wbia.algo.smk.script_smk import *  # NOQA
    """
    from os.path import join, basename, splitext
    import pandas as pd
    import vtool as vt

    dbdir = ut.truepath('/raid/work/Oxford/')
    data_fpath0 = join(dbdir, 'data_2007.pkl')

    if ut.checkpath(data_fpath0):
        data = ut.load_data(data_fpath0)
        return data
    else:
        word_dpath = join(dbdir, 'word_oxc1_hesaff_sift_16M_1M')
        _word_fpath_list = ut.ls(word_dpath)
        imgid_to_word_fpath = {
            splitext(basename(word_fpath))[0]: word_fpath
            for word_fpath in _word_fpath_list
        }
        readme_fpath = join(dbdir, 'README2.txt')
        # Image ids are listed starting on line 21 of the README
        imgid_order = ut.readfrom(readme_fpath).split('\n')[20:-1]

        data_uri_order = [x.replace('oxc1_', '') for x in imgid_order]

        imgid_to_df = {}
        for imgid in ut.ProgIter(imgid_order, label='reading kpts'):
            word_fpath = imgid_to_word_fpath[imgid]
            row_gen = (map(float,
                           line.strip('\n').split(' '))
                       for line in ut.read_lines_from(word_fpath)[2:])
            rows = [(int(word_id), x, y, e11, e12, e22)
                    for (word_id, x, y, e11, e12, e22) in row_gen]
            df = pd.DataFrame(
                rows, columns=['word_id', 'x', 'y', 'e11', 'e12', 'e22'])
            imgid_to_df[imgid] = df

        df_list = ut.take(imgid_to_df, imgid_order)

        nfeat_list = [len(df_) for df_ in df_list]
        offset_list = [0] + ut.cumsum(nfeat_list)
        shape = (offset_list[-1], 128)
        # shape = (16334970, 128)
        sift_fpath = join(dbdir, 'OxfordSIFTDescriptors',
                          'feat_oxc1_hesaff_sift.bin')
        with open(sift_fpath, 'rb') as file_:
            with ut.Timer('Reading SIFT binary file'):
                nvals = int(np.prod(shape))
                # np.fromstring is deprecated; fromfile reads the raw uint8s
                all_vecs = np.fromfile(file_, dtype=np.uint8, count=nvals)
            all_vecs = all_vecs.reshape(shape)

        kpts_list = [
            df_.loc[:, ('x', 'y', 'e11', 'e12', 'e22')].values
            for df_ in df_list
        ]
        wordid_list = [df_.loc[:, 'word_id'].values for df_ in df_list]
        kpts_Z = np.vstack(kpts_list)
        idx_to_wx = np.hstack(wordid_list)

        # assert len(np.unique(idx_to_wx)) == 1E6

        # Read the standard query order
        query_files = sorted(
            ut.glob(dbdir + '/oxford_groundtruth', '*_query.txt'))
        query_uri_order = []
        for qpath in query_files:
            text = ut.readfrom(qpath, verbose=0)
            query_uri = text.split(' ')[0].replace('oxc1_', '')
            query_uri_order.append(query_uri)

        logger.info('converting to invV')
        all_kpts = vt.convert_kptsZ_to_kpts(kpts_Z)

        data = {
            'offset_list': offset_list,
            'all_kpts': all_kpts,
            'all_vecs': all_vecs,
            'idx_to_wx': idx_to_wx,
            'data_uri_order': data_uri_order,
            'query_uri_order': query_uri_order,
        }
        ut.save_data(data_fpath0, data)
    return data
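A toy version of the word-file row parsing above, assuming each data line (after the two header lines) holds: word_id x y e11 e12 e22, space separated. The sample values are made up.

line = '173303 12.5 40.25 0.001 0.0002 0.0009\n'
word_id, x, y, e11, e12, e22 = map(float, line.strip('\n').split(' '))
row = (int(word_id), x, y, e11, e12, e22)
assert row[0] == 173303 and row[1] == 12.5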
Example #6
def merge_datasets(dataset_list):
    """
    Merges a list of dataset objects into a single combined dataset.
    """
    def consensus_check_factory():
        """
        Returns a temporary function used to check that all incoming values
        with the same key are consistent
        """
        from collections import defaultdict
        past_values = defaultdict(lambda: None)

        def consensus_check(value, key):
            assert past_values[key] is None or past_values[key] == value, (
                'key=%r with value=%r does not agree with past_value=%r' %
                (key, value, past_values[key]))
            past_values[key] = value
            return value

        return consensus_check

    total_num_labels = 0
    total_num_data = 0

    input_alias_list = [dataset.alias_key for dataset in dataset_list]

    alias_key = 'combo_' + ut.hashstr27(repr(input_alias_list), hashlen=8)
    training_dpath = ut.ensure_app_resource_dir('ibeis_cnn', 'training',
                                                alias_key)
    data_fpath = ut.unixjoin(training_dpath, alias_key + '_data.hdf5')
    labels_fpath = ut.unixjoin(training_dpath, alias_key + '_labels.hdf5')

    try:
        # Try to short-circuit with cached loading
        merged_dataset = DataSet.from_alias_key(alias_key)
        return merged_dataset
    except Exception as ex:
        ut.printex(ex,
                   'alias definitions have changed. alias_key=%r' %
                   (alias_key, ),
                   iswarning=True)

    # Build the dataset
    consensus_check = consensus_check_factory()

    for dataset in dataset_list:
        print(ut.get_file_nBytes_str(dataset.data_fpath))
        print(dataset.data_fpath_dict['full'])
        print(dataset.num_labels)
        print(dataset.data_per_label)
        total_num_labels += dataset.num_labels
        total_num_data += (dataset.data_per_label * dataset.num_labels)
        # Check that data_shape and data_per_label agree across datasets
        data_shape = consensus_check(dataset.data_shape, 'data_shape')
        data_per_label = consensus_check(dataset.data_per_label,
                                         'data_per_label')

    # HACK: hardcode dtypes for the merged arrays
    import numpy as np
    data_dtype = np.uint8
    label_dtype = np.int32
    data = np.empty((total_num_data, ) + data_shape, dtype=data_dtype)
    labels = np.empty(total_num_labels, dtype=label_dtype)

    data_left = 0
    data_right = None
    labels_left = 0
    labels_right = None
    for dataset in ut.ProgressIter(dataset_list,
                                   lbl='combining datasets',
                                   freq=1):
        X_all, y_all = dataset.subset('full')
        labels_right = labels_left + y_all.shape[0]
        data_right = data_left + X_all.shape[0]
        data[data_left:data_right] = X_all
        labels[labels_left:labels_right] = y_all
        data_left = data_right
        labels_left = labels_right

    ut.save_data(data_fpath, data)
    ut.save_data(labels_fpath, labels)

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    merged_dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=data_per_label,
        output_dims=1,
        num_labels=num_labels,
    )
    return merged_dataset
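A standalone demo of the consensus-check pattern used above: the first value seen for a key is remembered, and every later value for that key must match it.

from collections import defaultdict

past_values = defaultdict(lambda: None)

def consensus_check(value, key):
    assert past_values[key] is None or past_values[key] == value
    past_values[key] = value
    return value

consensus_check((64, 64, 1), 'data_shape')  # first value is recorded
consensus_check((64, 64, 1), 'data_shape')  # matching value passes
# consensus_check((32, 32, 1), 'data_shape') would raise AssertionError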
Example #7
def grab_liberty_siam_dataset(pairs=250000):
    """
    References:
        http://www.cs.ubc.ca/~mbrown/patchdata/patchdata.html
        https://github.com/osdf/datasets/blob/master/patchdata/dataset.py

    Notes:
        "info.txt" contains the match information Each row of info.txt
        corresponds corresponds to a separate patch, with the patches ordered
        from left to right and top to bottom in each bitmap image.

        3 types of metadata files

        info.txt - contains patch ids that correspond with the order of patches
          in the bmp images
          In the format:
              pointid, unused

        interest.txt -
            interest points corresponding to patches with patchids
            has same number of rows as info.txt
            In the format:
                reference image id, x, y, orientation, scale (in log2 units)

        m50_<d>_<d>_0.txt -
             matches files
             patchID1  3DpointID1  unused1  patchID2  3DpointID2  unused2

    CommandLine:
        python -m ibeis_cnn.ingest_data --test-grab_liberty_siam_dataset --show

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.ingest_data import *  # NOQA
        >>> pairs = 500
        >>> dataset = grab_liberty_siam_dataset(pairs)
        >>> ut.quit_if_noshow()
        >>> from ibeis_cnn import draw_results
        >>> #ibsplugin.rrr()
        >>> flat_metadata = {}
        >>> data, labels = dataset.subset('full')
        >>> ut.quit_if_noshow()
        >>> warped_patch1_list = data[::2]
        >>> warped_patch2_list = data[1::2]
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = {
        'detector': 'dog',
        'pairs': pairs,
    }

    assert datakw['detector'] in ['dog', 'harris']
    assert pairs in [500, 50000, 100000, 250000]

    liberty_urls = {
        'dog': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty.zip',
        'harris': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty_harris.zip',
    }
    url = liberty_urls[datakw['detector']]
    ds_path = ut.grab_zipped_url(url)

    ds_name = splitext(basename(ds_path))[0]
    alias_key = 'liberty;' + ut.dict_str(datakw, nl=False, explicit=True)
    cfgstr = ','.join([str(val) for key, val in ut.iteritems_sorted(datakw)])

    # TODO: allow a move of the base data prefix

    training_dpath = ut.ensure_app_resource_dir('ibeis_cnn', 'training',
                                                ds_name)
    if ut.get_argflag('--vtd'):
        ut.vd(training_dpath)
    ut.ensuredir(training_dpath)

    data_fpath = join(training_dpath, 'liberty_data_' + cfgstr + '.pkl')
    labels_fpath = join(training_dpath, 'liberty_labels_' + cfgstr + '.pkl')

    if not ut.checkpath(data_fpath, verbose=True):
        data, labels = ingest_helpers.extract_liberty_style_patches(
            ds_path, pairs)
        ut.save_data(data_fpath, data)
        ut.save_data(labels_fpath, labels)

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=(64, 64, 1),
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    return dataset
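A sketch of the pair layout implied by data_per_label=2 and the doctest above: patches of a pair are interleaved along the first axis, so even rows hold the first patch of each pair and odd rows the second. Shapes here are toy values, not the real (64, 64, 1) patches.

import numpy as np

data = np.arange(8).reshape(4, 2)   # 4 patches -> 2 pairs
warped_patch1_list = data[::2]      # patches 0 and 2
warped_patch2_list = data[1::2]     # patches 1 and 3
assert len(warped_patch1_list) == len(warped_patch2_list) == 2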