Example #1
def subset_metadata(dataset, key='full'):
    metadata_fpath = dataset.fpath_dict[key].get('metadata', None)
    if metadata_fpath is not None:
        flat_metadata = ut.load_data(metadata_fpath, verbose=True)
    else:
        flat_metadata = None
    return flat_metadata
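
For reference, a minimal round-trip sketch of the ut.load_data / ut.save_data pair that every snippet on this page relies on. It assumes utool is importable as ut and that the pair dispatches on the file extension ('.pkl' here); the temporary path is illustrative only.

import utool as ut

# Hypothetical metadata blob and cache path
flat_metadata = {'colorspace': 'gray', 'aid_pairs': [(1, 2), (3, 4)]}
metadata_fpath = ut.truepath('~/tmp_metadata.pkl')

ut.save_data(metadata_fpath, flat_metadata)           # format chosen from the .pkl extension
assert ut.load_data(metadata_fpath) == flat_metadata  # round-trips the same object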
Example #2
def subset_data(dataset, key='full'):
    data_fpath = dataset.fpath_dict[key]['data']
    data = ut.load_data(data_fpath, verbose=True)
    if len(data.shape) == 3:
        # add channel dimension for implicit grayscale
        data.shape = data.shape + (1,)
    return data
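
The in-place data.shape assignment above appends a trailing channel axis to implicitly-grayscale data. Here is a short numpy-only sketch of the equivalent np.expand_dims form; the toy shapes are illustrative.

import numpy as np

data = np.zeros((10, 64, 64), dtype=np.uint8)   # (N, H, W) grayscale batch
if data.ndim == 3:
    # add channel dimension for implicit grayscale (a view, no copy)
    data = np.expand_dims(data, axis=-1)
assert data.shape == (10, 64, 64, 1)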
Example #3
    def ensure_results(self, expt_name=None, nocompute=None):
        """
        Subclasses must obey the measure_<expt_name>, draw_<expt_name> contract
        """
        if nocompute is None:
            nocompute = ut.get_argflag('--nocompute')

        if expt_name is None and exists(self.dpath):
            # Load all
            fpaths = ut.glob(str(self.dpath), '*.pkl')
            expt_names = [splitext(basename(fpath))[0] for fpath in fpaths]
            for fpath, expt_name in zip(fpaths, expt_names):
                self.expt_results[expt_name] = ut.load_data(fpath)
        else:
            # expt_name = splitext(basename(fpath))[0]
            fpath = join(str(self.dpath), expt_name + '.pkl')
            # fpath = ut.truepath(fpath)
            if not exists(fpath):
                ut.cprint(
                    'Experiment results {} do not exist'.format(expt_name),
                    'red')
                ut.cprint('First re-setup to check if it is a path issue',
                          'red')
                if nocompute:
                    raise Exception(
                        str(expt_name) + ' does not exist for ' +
                        str(self.dbname))

                if self.ibs is None:
                    self._precollect()
                ut.cprint('Checking new fpath', 'yellow')
                fpath = join(str(self.dpath), expt_name + '.pkl')
                logger.info('fpath = %r' % (fpath, ))
                if not exists(fpath):
                    ut.cprint('Results still missing need to re-measure',
                              'red')
                    # assert False
                    # self._setup()
                    getattr(self, 'measure_' + expt_name)()
                else:
                    ut.cprint('Re-setup fixed it', 'green')
            else:
                logger.info('Experiment results {} exist'.format(expt_name))
            self.expt_results[expt_name] = ut.load_data(fpath)
            return self.expt_results[expt_name]
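
ensure_results falls back to getattr(self, 'measure_' + expt_name)() when the cached pickle is missing, so a subclass only needs matching measure_<expt_name> / draw_<expt_name> methods that write and consume self.expt_results. Below is a stripped-down, hypothetical sketch of that contract; the class and method names are illustrative, not from the codebase.

from os.path import join
import utool as ut

class ToyExpt(object):
    """Hypothetical subclass obeying the measure_<expt_name>/draw_<expt_name> contract."""

    def __init__(self, dpath):
        self.dpath = dpath
        self.expt_results = {}

    def measure_accuracy(self):
        # compute results for expt_name='accuracy' and cache them as accuracy.pkl
        result = {'acc': 0.9}
        ut.save_data(join(self.dpath, 'accuracy.pkl'), result)
        self.expt_results['accuracy'] = result

    def draw_accuracy(self):
        # consume whatever ensure_results('accuracy') loaded
        print(self.expt_results['accuracy'])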
Example #4
def load_latest_classifiers(infr, dpath):
    from ibeis.algo.verif import deploy
    task_clf_fpaths = deploy.Deployer(dpath).find_latest_local()
    classifiers = {}
    for task_key, fpath in task_clf_fpaths.items():
        clf_info = ut.load_data(fpath)
        assert clf_info['metadata']['task_key'] == task_key, (
            'bad saved clf at fpath={}'.format(fpath))
        classifiers[task_key] = clf_info
    infr.verifiers = classifiers
Example #5
def ensure(self, task_key):
    _, fname = self._make_deploy_metadata(task_key=task_key)
    fpath = join(self.dpath, fname)
    if exists(fpath):
        deploy_info = ut.load_data(fpath)
        assert bool(deploy_info['clf']), 'must have clf'
    else:
        deploy_info = self.deploy(task_key=task_key)
        assert exists(fpath), 'must now exist'
    verif = verifier.Verifier(self.pblm.infr.ibs, deploy_info=deploy_info)
    assert verif.metadata['task_key'] == task_key, (
        'bad saved clf at fpath={}'.format(fpath))
    return verif
Example #6
def load(dataset):
    dataset.ensure_dirs()
    dataset.ensure_symlinked()
    if not exists(dataset.info_fpath):
        raise IOError('dataset info manifest cache miss')
    else:
        dataset._info = ut.load_data(dataset.info_fpath)
    if not exists(dataset.data_fpath):
        raise IOError('dataset data cache miss')
    dataset.load_splitsets()
    # Hack
    if not exists(dataset.fpath_dict['full']['metadata']):
        dataset.fpath_dict['full']['metadata'] = None
Example #7
def load(data_fpath, labels_fpath=None):
    # Load X matrix (data)
    data = ut.load_data(data_fpath)
    labels = ut.load_data(labels_fpath) if labels_fpath is not None else None

    #if splitext(data_fpath)[1] == '.hdf5':
    #    data = ut.load_hdf5(data_fpath)
    #else:
    #    data = np.load(data_fpath, mmap_mode='r')
    ## Load y vector (labels)
    #labels = None
    #if labels_fpath is not None:
    #    if splitext(labels_fpath)[1] == '.hdf5':
    #        labels = ut.load_hdf5(labels_fpath)
    #    else:
    #        labels = np.load(labels_fpath, mmap_mode='r')
    ## TODO: This should be part of data preprocessing
    ## Ensure that data is 4-dimensional
    if len(data.shape) == 3:
        # add channel dimension for implicit grayscale
        data.shape = data.shape + (1, )
    # Return data
    return data, labels
Example #8
def load_oxford_2013():
    """
    Found this data in the README of the SMK publication
    https://hal.inria.fr/hal-00864684/document
    http://people.rennes.inria.fr/Herve.Jegou/publications.html
    along with a download script

    CommandLine:
        # Download oxford13 data
        cd ~/work/Oxford
        mkdir -p smk_data_iccv_2013
        cd smk_data_iccv_2013
        wget -nH --cut-dirs=4 -r -Pdata/ ftp://ftp.irisa.fr/local/texmex/corpus/iccv2013/

    This dataset has 5063 images, whereas the 2007 version has 5062.
    This dataset seems to contain an extra junk image:
        ashmolean_000214

    # Remember that matlab is 1 indexed!
    # DONT FORGET TO CONVERT TO 0 INDEXING!
    """
    from yael.ynumpy import fvecs_read
    from yael.yutils import load_ext
    import scipy.io
    import vtool as vt
    from os.path import join

    dbdir = ut.truepath('/raid/work/Oxford/')
    datadir = dbdir + '/smk_data_iccv_2013/data/'

    # we are not retraining, so this is unused
    # # Training data descriptors for Paris6k dataset
    # train_sift_fname = join(datadir, 'paris_sift.uint8')  # NOQA
    # # File storing visual words of Paris6k descriptors used in our ICCV paper
    # train_vw_fname = join(datadir, 'clust_preprocessed/oxford_train_vw.int32')

    # Pre-learned quantizer used in ICCV paper (used if docluster=false)
    codebook_fname = join(datadir, 'clust_preprocessed/oxford_codebook.fvecs')

    # Files storing descriptors/geometry for Oxford5k dataset
    test_sift_fname = join(datadir, 'oxford_sift.uint8')
    test_geom_fname = join(datadir, 'oxford_geom_sift.float')
    test_nf_fname = join(datadir, 'oxford_nsift.uint32')

    # File storing visual words of Oxford5k descriptors used in our ICCV paper
    test_vw_fname = join(datadir, 'clust_preprocessed/oxford_vw.int32')
    # Ground-truth for Oxford dataset
    gnd_fname = join(datadir, 'gnd_oxford.mat')

    oxford_vecs = load_ext(test_sift_fname, ndims=128, verbose=True)
    oxford_nfeats = load_ext(test_nf_fname, verbose=True)
    oxford_words = fvecs_read(codebook_fname)
    oxford_wids = load_ext(test_vw_fname, verbose=True) - 1

    test_geom_invV_fname = test_geom_fname + '.invV.pkl'
    try:
        all_kpts = ut.load_data(test_geom_invV_fname)
        logger.info('loaded invV keypoints')
    except IOError:
        oxford_kptsZ = load_ext(test_geom_fname, ndims=5, verbose=True)
        logger.info('converting to invV keypoints')
        all_kpts = vt.convert_kptsZ_to_kpts(oxford_kptsZ)
        ut.save_data(test_geom_invV_fname, all_kpts)

    gnd_ox = scipy.io.loadmat(gnd_fname)
    imlist = [x[0][0] for x in gnd_ox['imlist']]
    qx_to_dx = gnd_ox['qidx'] - 1

    data_uri_order = imlist
    query_uri_order = ut.take(data_uri_order, qx_to_dx)

    offset_list = np.hstack(([0], oxford_nfeats.cumsum())).astype(np.int64)

    # query_gnd = gnd_ox['gnd'][0][0]
    # bboxes = query_gnd[0]
    # qx_to_ok_gtidxs1 = [x[0] for x in query_gnd[1][0]]
    # qx_to_junk_gtidxs2 = [x[0] for x in query_gnd[2][0]]
    # # ut.depth_profile(qx_to_gtidxs1)
    # # ut.depth_profile(qx_to_gtidxs2)

    assert sum(oxford_nfeats) == len(oxford_vecs)
    assert offset_list[-1] == len(oxford_vecs)
    assert len(oxford_wids) == len(oxford_vecs)
    assert oxford_wids.max() == len(oxford_words) - 1

    data = {
        'offset_list': offset_list,
        'all_kpts': all_kpts,
        'all_vecs': oxford_vecs,
        'words': oxford_words,
        'idx_to_wx': oxford_wids,
        'data_uri_order': data_uri_order,
        'query_uri_order': query_uri_order,
    }
    return data
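
The offset_list returned above is the cumulative feature count per image, so the descriptors and word ids of image i live in the slice between consecutive offsets. A small sketch of that indexing convention, using the keys of the returned data dict:

def image_vecs(data, i):
    # slice the flat arrays back into per-image features
    left, right = data['offset_list'][i], data['offset_list'][i + 1]
    return data['all_vecs'][left:right], data['idx_to_wx'][left:right]

# e.g. vecs_0, wx_0 = image_vecs(data, 0)  # features of data['data_uri_order'][0]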
Example #9
def load_oxford_2007():
    """
    Loads data from
    http://www.robots.ox.ac.uk:5000/~vgg/publications/2007/Philbin07/philbin07.pdf

    >>> from wbia.algo.smk.script_smk import *  # NOQA
    """
    from os.path import join, basename, splitext
    import pandas as pd
    import vtool as vt

    dbdir = ut.truepath('/raid/work/Oxford/')
    data_fpath0 = join(dbdir, 'data_2007.pkl')

    if ut.checkpath(data_fpath0):
        data = ut.load_data(data_fpath0)
        return data
    else:
        word_dpath = join(dbdir, 'word_oxc1_hesaff_sift_16M_1M')
        _word_fpath_list = ut.ls(word_dpath)
        imgid_to_word_fpath = {
            splitext(basename(word_fpath))[0]: word_fpath
            for word_fpath in _word_fpath_list
        }
        readme_fpath = join(dbdir, 'README2.txt')
        imgid_order = ut.readfrom(readme_fpath).split('\n')[20:-1]

        data_uri_order = [x.replace('oxc1_', '') for x in imgid_order]

        imgid_to_df = {}
        for imgid in ut.ProgIter(imgid_order, label='reading kpts'):
            word_fpath = imgid_to_word_fpath[imgid]
            row_gen = (map(float,
                           line.strip('\n').split(' '))
                       for line in ut.read_lines_from(word_fpath)[2:])
            rows = [(int(word_id), x, y, e11, e12, e22)
                    for (word_id, x, y, e11, e12, e22) in row_gen]
            df = pd.DataFrame(
                rows, columns=['word_id', 'x', 'y', 'e11', 'e12', 'e22'])
            imgid_to_df[imgid] = df

        df_list = ut.take(imgid_to_df, imgid_order)

        nfeat_list = [len(df_) for df_ in df_list]
        offset_list = [0] + ut.cumsum(nfeat_list)
        shape = (offset_list[-1], 128)
        # shape = (16334970, 128)
        sift_fpath = join(dbdir, 'OxfordSIFTDescriptors',
                          'feat_oxc1_hesaff_sift.bin')
        # Read the packed uint8 SIFT descriptors directly from the binary file
        with open(sift_fpath, 'rb') as file_:
            with ut.Timer('Reading SIFT binary file'):
                nbytes = np.prod(shape)
                all_vecs = np.fromfile(file_, dtype=np.uint8, count=nbytes)
            all_vecs = all_vecs.reshape(shape)

        kpts_list = [
            df_.loc[:, ('x', 'y', 'e11', 'e12', 'e22')].values
            for df_ in df_list
        ]
        wordid_list = [df_.loc[:, 'word_id'].values for df_ in df_list]
        kpts_Z = np.vstack(kpts_list)
        idx_to_wx = np.hstack(wordid_list)

        # assert len(np.unique(idx_to_wx)) == 1E6

        # Reqd standard query order
        query_files = sorted(
            ut.glob(dbdir + '/oxford_groundtruth', '*_query.txt'))
        query_uri_order = []
        for qpath in query_files:
            text = ut.readfrom(qpath, verbose=0)
            query_uri = text.split(' ')[0].replace('oxc1_', '')
            query_uri_order.append(query_uri)

        logger.info('converting to invV')
        all_kpts = vt.convert_kptsZ_to_kpts(kpts_Z)

        data = {
            'offset_list': offset_list,
            'all_kpts': all_kpts,
            'all_vecs': all_vecs,
            'idx_to_wx': idx_to_wx,
            'data_uri_order': data_uri_order,
            'query_uri_order': query_uri_order,
        }
        ut.save_data(data_fpath0, data)
    return data
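
Both Oxford loaders use the same load-or-rebuild pattern: ut.checkpath on the cache file, ut.load_data on a hit, and ut.save_data after rebuilding on a miss. A generic hedged sketch of that pattern follows; the helper name and the build callable are illustrative, not part of the codebase.

import utool as ut

def cached_build(cache_fpath, build_func):
    # hypothetical helper mirroring the caching logic in load_oxford_2007
    if ut.checkpath(cache_fpath):
        return ut.load_data(cache_fpath)
    data = build_func()
    ut.save_data(cache_fpath, data)
    return data

# usage sketch: data = cached_build(data_fpath0, lambda: build_oxford_2007(dbdir))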
Example #10
    def _make_verifier(self, ibs, deploy_fpath, task_key):
        """
        Ignore:
            # py3 side
            clf = deploy_info['clf']
            a = clf.estimators_[0]
            b = a.tree_
            ut.save_data('_tree.pkl', b)
            c = b.__getstate__()
            d = c['nodes']
            ut.save_data('_nodes.pkl', d)

            a.estimators_[0].tree_.__getstate__()['nodes']


        Ignore:
            # py2 side
            ut.load_data('_tree.pkl')
            ut.load_data('_nodes.pkl')

            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> params = dict(sample_method='random')
            >>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
            >>> pblm.setup(with_simple=False)
            >>> task_key = pblm.primary_task_key
            >>> self = Deployer(dpath='.', pblm=pblm)
            >>> deploy_info = self.deploy()

            a = deploy_info['clf']
            d = a.estimators_[0].tree_.__getstate__()['nodes']


        Ignore:
            I'm having a similar issue when trying to use python2 to load a
            sklearn RandomForestClassifier that I saved in python3. I created a
            MWE.

            In python 3

                import numpy as np
                import pickle
                data = np.array(
                    [( 1, 26, 69,   5.32214928e+00,  0.69562945, 563,  908.,  1),
                     ( 2,  7, 62,   1.74883020e+00,  0.33854101, 483,  780.,  1),
                     (-1, -1, -2,  -2.00000000e+00,  0.76420451,   7,    9., -2),
                     (-1, -1, -2,  -2.00000000e+00,  0.        ,  62,  106., -2)],
                  dtype=[('left_child', '<i8'), ('right_child', '<i8'),
                  ('feature', '<i8'), ('threshold', '<f8'), ('impurity',
                  '<f8'), ('n_node_samples', '<i8'),
                  ('weighted_n_node_samples', '<f8'), ('missing_direction',
                  '<i8')])

                # Save using pickle
                with open('data.pkl', 'wb') as file_:
                    # Use protocol 2 to support python2 and 3
                    pickle.dump(data, file_, protocol=2)

                # Save with numpy directly
                np.save('data.npy', data)

            Then in python 2
                # Load with pickle
                import pickle
                with open('data.pkl', 'rb') as file_:
                    data = pickle.load(file_)
                # This results in `ValueError: non-string names in Numpy dtype unpickling`

                # Load with numpy directly
                data = np.load('data.npy')
                # This works

            However this still doesn't make sklearn play nice between 2 and 3.
            So, how can we get pickle to load this numpy object correctly?
            Here is the fix suggested in the link:

                from lib2to3.fixes.fix_imports import MAPPING
                import sys
                import pickle

                # MAPPING maps Python 2 names to Python 3 names. We want this in reverse.
                REVERSE_MAPPING = {}
                for key, val in MAPPING.items():
                    REVERSE_MAPPING[val] = key

                # We can override the Unpickler and loads
                class Python_3_Unpickler(pickle.Unpickler):
                    def find_class(self, module, name):
                        if module in REVERSE_MAPPING:
                            module = REVERSE_MAPPING[module]
                        __import__(module)
                        mod = sys.modules[module]
                        klass = getattr(mod, name)
                        return klass

                with open('data.pkl', 'rb') as file_:
                    data = Python_3_Unpickler(file_).load()

            This still doesn't work



            https://stackoverflow.com/questions/41720952/unpickle-sklearn-tree-descisiontreeregressor-in-python-2-from-python3

        """
        deploy_info = ut.load_data(deploy_fpath)
        verif = verifier.Verifier(ibs, deploy_info=deploy_info)
        if task_key is not None:
            assert (verif.metadata['task_key'] == task_key
                    ), 'bad saved clf at fpath={}'.format(deploy_fpath)
        return verif
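
The notes in the docstring observe that the tree's structured nodes array pickled under Python 3 fails to unpickle under Python 2 ('non-string names in Numpy dtype unpickling'), while np.save / np.load of the same array works. A minimal sketch of that numpy-only workaround; it only moves the raw array and, as the notes say, does not make the full sklearn estimator portable.

import numpy as np

# Structured array shaped like tree_.__getstate__()['nodes'] in the notes above
nodes = np.array(
    [(1, 26, 69, 5.32, 0.70, 563, 908., 1),
     (-1, -1, -2, -2.00, 0.76, 7, 9., -2)],
    dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'),
           ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'),
           ('weighted_n_node_samples', '<f8'), ('missing_direction', '<i8')])

np.save('_nodes.npy', nodes)    # Python 3 side
nodes2 = np.load('_nodes.npy')  # Python 2 side loads this without the dtype error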
Example #11
def subset_labels(dataset, key='full'):
    labels_fpath = dataset.fpath_dict[key]['labels']
    labels = (None if labels_fpath is None
              else ut.load_data(labels_fpath, verbose=True))
    return labels
Example #12
def merge_datasets(dataset_list):
    """
    Merges a list of dataset objects into a single combined dataset.
    """
    def consensus_check_factory():
        """
        Returns a temporary function used to check that all incoming values
        with the same key are consistent
        """
        from collections import defaultdict
        past_values = defaultdict(lambda: None)

        def consensus_check(value, key):
            assert past_values[key] is None or past_values[key] == value, (
                'key=%r with value=%r does not agree with past_value=%r' %
                (key, value, past_values[key]))
            past_values[key] = value
            return value

        return consensus_check

    total_num_labels = 0
    total_num_data = 0

    input_alias_list = [dataset.alias_key for dataset in dataset_list]

    alias_key = 'combo_' + ut.hashstr27(repr(input_alias_list), hashlen=8)
    training_dpath = ut.ensure_app_resource_dir('ibeis_cnn', 'training',
                                                alias_key)
    data_fpath = ut.unixjoin(training_dpath, alias_key + '_data.hdf5')
    labels_fpath = ut.unixjoin(training_dpath, alias_key + '_labels.hdf5')

    try:
        # Try to short-circuit with cached loading
        merged_dataset = DataSet.from_alias_key(alias_key)
        return merged_dataset
    except (Exception, AssertionError) as ex:
        ut.printex(ex,
                   'alias definitions have changed. alias_key=%r' %
                   (alias_key, ),
                   iswarning=True)

    # Build the dataset
    consensus_check = consensus_check_factory()

    for dataset in dataset_list:
        print(ut.get_file_nBytes_str(dataset.data_fpath))
        print(dataset.data_fpath_dict['full'])
        print(dataset.num_labels)
        print(dataset.data_per_label)
        total_num_labels += dataset.num_labels
        total_num_data += (dataset.data_per_label * dataset.num_labels)
        # check that all data_dims agree
        data_shape = consensus_check(dataset.data_shape, 'data_shape')
        data_per_label = consensus_check(dataset.data_per_label,
                                         'data_per_label')

    # hack record this
    import numpy as np
    data_dtype = np.uint8
    label_dtype = np.int32
    data = np.empty((total_num_data, ) + data_shape, dtype=data_dtype)
    labels = np.empty(total_num_labels, dtype=label_dtype)

    #def iterable_assignment():
    #    pass
    data_left = 0
    data_right = None
    labels_left = 0
    labels_right = None
    for dataset in ut.ProgressIter(dataset_list,
                                   lbl='combining datasets',
                                   freq=1):
        X_all, y_all = dataset.subset('full')
        labels_right = labels_left + y_all.shape[0]
        data_right = data_left + X_all.shape[0]
        data[data_left:data_right] = X_all
        labels[labels_left:labels_right] = y_all
        data_left = data_right
        labels_left = labels_right

    ut.save_data(data_fpath, data)
    ut.save_data(labels_fpath, labels)

    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    merged_dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=data_per_label,
        output_dims=1,
        num_labels=num_labels,
    )
    return merged_dataset
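
When everything fits in memory, the preallocate-and-fill loop above with its left/right index bookkeeping produces the same result as a single np.concatenate over the per-dataset subsets. A self-contained sketch with toy shapes:

import numpy as np

# Two toy 'datasets' worth of data and labels
X_parts = [np.zeros((3, 8, 8, 1), dtype=np.uint8), np.ones((2, 8, 8, 1), dtype=np.uint8)]
y_parts = [np.zeros(3, dtype=np.int32), np.ones(2, dtype=np.int32)]

data = np.concatenate(X_parts, axis=0)     # shape (5, 8, 8, 1)
labels = np.concatenate(y_parts, axis=0)   # shape (5,)
assert data.shape[0] == 5 and labels.shape[0] == 5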
Example #13
def get_ibeis_part_siam_dataset(**kwargs):
    """
    PARTS based network data

    CommandLine:
        python -m ibeis_cnn.ingest_data --test-get_ibeis_part_siam_dataset --show
        python -m ibeis_cnn.ingest_data --test-get_ibeis_part_siam_dataset --show --db PZ_Master1 --acfg_name timectrl
        python -m ibeis_cnn.ingest_data --test-get_ibeis_part_siam_dataset --show --db PZ_MTEST --acfg_name unctrl --dryrun

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.ingest_data import *  # NOQA
        >>> from ibeis_cnn import draw_results
        >>> import ibeis
        >>> kwargs = {}  # ut.argparse_dict({'max_examples': None, 'num_top': 3})
        >>> dataset = get_ibeis_part_siam_dataset(**kwargs)
        >>> ut.quit_if_noshow()
        >>> dataset.interact(ibs=dataset.getprop('ibs'))
        >>> ut.show_if_requested()
    """
    import ibeis
    datakw = ut.argparse_dict(
        {
            'colorspace': 'gray',
            'acfg_name': 'ctrl',
            #'db': None,
            'db': 'PZ_MTEST',
        },
        alias_dict={'acfg_name': ['acfg']},
        verbose=True)

    datakw.update(kwargs)
    print('\n\n[get_ibeis_part_siam_dataset] START')

    alias_key = ut.dict_str(datakw, nl=False, explicit=True)

    dbname = datakw.pop('db')

    try:
        if NOCACHE_DATASET:
            raise Exception('forced cache off')
        # Try to short-circuit with cached loading
        dataset = DataSet.from_alias_key(alias_key)
        dataset.setprop('ibs', lambda: ibeis.opendb(db=dbname))
        return dataset
    except Exception as ex:
        ut.printex(ex,
                   'alias definitions have changed. alias_key=%r' %
                   (alias_key, ),
                   iswarning=True)

    with ut.Indenter('[LOAD IBEIS DB]'):
        ibs = ibeis.opendb(db=dbname)

    # Nets dir is the root dir for all training on this data
    training_dpath = ibs.get_neuralnet_dir()
    ut.ensuredir(training_dpath)

    with ut.Indenter('[BuildDS]'):
        # Get training data pairs
        colorspace = datakw.pop('colorspace')
        (aid_pairs, label_list,
         flat_metadata) = ingest_ibeis.get_aidpairs_partmatch(ibs, **datakw)
        # Extract and cache the data, labels, and metadata
        if ut.get_argflag('--dryrun'):
            print('exiting due to dry run')
            import sys
            sys.exit(0)
        tup = ingest_ibeis.cached_part_match_training_data_fpaths(
            ibs, aid_pairs, label_list, flat_metadata, colorspace=colorspace)
        data_fpath, labels_fpath, metadata_fpath, training_dpath, data_shape = tup
        print('\n[get_ibeis_part_siam_dataset] FINISH\n\n')

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=metadata_fpath,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    dataset.setprop('ibs', ibs)
    return dataset
Example #14
def get_ibeis_patch_siam_dataset(**kwargs):
    """
    CommandLine:
        python -m ibeis_cnn.ingest_data --test-get_ibeis_patch_siam_dataset --show
        python -m ibeis_cnn.ingest_data --test-get_ibeis_patch_siam_dataset --show --db PZ_Master1 --acfg_name default
        python -m ibeis_cnn.ingest_data --test-get_ibeis_patch_siam_dataset --show --db PZ_Master1 --acfg_name timectrl
        python -m ibeis_cnn.ingest_data --test-get_ibeis_patch_siam_dataset --show --db PZ_MTEST --acfg_name unctrl --dryrun

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.ingest_data import *  # NOQA
        >>> from ibeis_cnn import draw_results
        >>> import ibeis
        >>> kwargs = {}  # ut.argparse_dict({'max_examples': None, 'num_top': 3})
        >>> dataset = get_ibeis_patch_siam_dataset(**kwargs)
        >>> ut.quit_if_noshow()
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = ut.argparse_dict(
        {
            #'db': 'PZ_MTEST',
            'max_examples': None,
            #'num_top': 3,
            'num_top': None,
            'min_featweight': .8 if not ut.WIN32 else None,
            'controlled': True,
            'colorspace': 'gray',
            'acfg_name': None,
        },
        alias_dict={'acfg_name': ['acfg', 'a']},
        verbose=True)

    datakw.update(kwargs)

    #ut.get_func_kwargs(ingest_ibeis.get_aidpairs_and_matches)

    if datakw['acfg_name'] is not None:
        del datakw['controlled']
    if datakw['max_examples'] is None:
        del datakw['max_examples']
    if datakw['num_top'] is None:
        del datakw['num_top']

    with ut.Indenter('[LOAD IBEIS DB]'):
        import ibeis
        dbname = ut.get_argval('--db', default='PZ_MTEST')
        ibs = ibeis.opendb(dbname=dbname, defaultdb='PZ_MTEST')

    # Nets dir is the root dir for all training on this data
    training_dpath = ibs.get_neuralnet_dir()
    ut.ensuredir(training_dpath)
    print('\n\n[get_ibeis_patch_siam_dataset] START')
    #log_dir = join(training_dpath, 'logs')
    #ut.start_logging(log_dir=log_dir)

    alias_key = ibs.get_dbname() + ';' + ut.dict_str(
        datakw, nl=False, explicit=True)
    try:
        if NOCACHE_DATASET:
            raise Exception('forced cache off')
        # Try to short-circuit with cached loading
        dataset = DataSet.from_alias_key(alias_key)
        dataset.setprop('ibs', lambda: ibeis.opendb(db=dbname))
        return dataset
    except Exception as ex:
        ut.printex(ex,
                   'alias definitions have changed. alias_key=%r' %
                   (alias_key, ),
                   iswarning=True)

    with ut.Indenter('[BuildDS]'):
        # Get training data pairs
        colorspace = datakw.pop('colorspace')
        patchmatch_tup = ingest_ibeis.get_aidpairs_and_matches(ibs, **datakw)
        aid1_list, aid2_list, kpts1_m_list, kpts2_m_list, fm_list, metadata_lists = patchmatch_tup
        # Extract and cache the data
        # TODO: metadata
        if ut.get_argflag('--dryrun'):
            print('exiting due to dry run')
            import sys
            sys.exit(0)
        tup = ingest_ibeis.cached_patchmetric_training_data_fpaths(
            ibs,
            aid1_list,
            aid2_list,
            kpts1_m_list,
            kpts2_m_list,
            fm_list,
            metadata_lists,
            colorspace=colorspace)
        data_fpath, labels_fpath, metadata_fpath, training_dpath, data_shape = tup
        print('\n[get_ibeis_patch_siam_dataset] FINISH\n\n')

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=metadata_fpath,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    dataset.setprop('ibs', ibs)
    return dataset
Example #15
def grab_liberty_siam_dataset(pairs=250000):
    """
    References:
        http://www.cs.ubc.ca/~mbrown/patchdata/patchdata.html
        https://github.com/osdf/datasets/blob/master/patchdata/dataset.py

    Notes:
        "info.txt" contains the match information Each row of info.txt
        corresponds corresponds to a separate patch, with the patches ordered
        from left to right and top to bottom in each bitmap image.

        3 types of metadata files

        info.txt - contains patch ids that correspond with the order of patches
          in the bmp images
          In the format:
              pointid, unused

        interest.txt -
            interest points corresponding to patches with patchids
            has same number of rows as info.txt
            In the format:
                reference image id, x, y, orientation, scale (in log2 units)

        m50_<d>_<d>_0.txt -
             matches files
             patchID1  3DpointID1  unused1  patchID2  3DpointID2  unused2

    CommandLine:
        python -m ibeis_cnn.ingest_data --test-grab_liberty_siam_dataset --show

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.ingest_data import *  # NOQA
        >>> pairs = 500
        >>> dataset = grab_liberty_siam_dataset(pairs)
        >>> ut.quit_if_noshow()
        >>> from ibeis_cnn import draw_results
        >>> #ibsplugin.rrr()
        >>> flat_metadata = {}
        >>> data, labels = dataset.subset('full')
        >>> ut.quit_if_noshow()
        >>> warped_patch1_list = data[::2]
        >>> warped_patch2_list = data[1::2]
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = {
        'detector': 'dog',
        'pairs': pairs,
    }

    assert datakw['detector'] in ['dog', 'harris']
    assert pairs in [500, 50000, 100000, 250000]

    liberty_urls = {
        'dog': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty.zip',
        'harris': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty_harris.zip',
    }
    url = liberty_urls[datakw['detector']]
    ds_path = ut.grab_zipped_url(url)

    ds_name = splitext(basename(ds_path))[0]
    alias_key = 'liberty;' + ut.dict_str(datakw, nl=False, explicit=True)
    cfgstr = ','.join([str(val) for key, val in ut.iteritems_sorted(datakw)])

    # TODO: allow a move of the base data prefix

    training_dpath = ut.ensure_app_resource_dir('ibeis_cnn', 'training',
                                                ds_name)
    if ut.get_argflag('--vtd'):
        ut.vd(training_dpath)
    ut.ensuredir(training_dpath)

    data_fpath = join(training_dpath, 'liberty_data_' + cfgstr + '.pkl')
    labels_fpath = join(training_dpath, 'liberty_labels_' + cfgstr + '.pkl')

    if not ut.checkpath(data_fpath, verbose=True):
        data, labels = ingest_helpers.extract_liberty_style_patches(
            ds_path, pairs)
        ut.save_data(data_fpath, data)
        ut.save_data(labels_fpath, labels)

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=(64, 64, 1),
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    return dataset
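
Because data_per_label=2, the liberty data is stored as interleaved patch pairs, which is why the doctest above takes data[::2] and data[1::2]. A small sketch of that pairing convention on a toy array:

import numpy as np

num_pairs = 4
data = np.arange(num_pairs * 2 * 64 * 64).reshape(num_pairs * 2, 64, 64, 1)
warped_patch1_list = data[0::2]   # first patch of each pair
warped_patch2_list = data[1::2]   # second patch of each pair
pairs = data.reshape(num_pairs, 2, 64, 64, 1)   # same pairing, grouped per row
assert np.all(pairs[:, 0] == warped_patch1_list)
assert np.all(pairs[:, 1] == warped_patch2_list)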