Example #1
def run(args):
    print '> compute tv space'
    files, _ = pc.getFiles(args.inputfolder,
                           args.suffix,
                           args.labelfile,
                           exact=args.exact)
    ubm = ubm_adaption.loadGMM(args.load_ubm)

    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files))
    print 'extract stats'

    def extract(i):
        descr = pc.loadDescriptors(files[i])
        of = os.path.join(
            args.outputfolder,
            os.path.basename(files[i]).split('.', 1)[0] + '_stat.pkl.gz')
        if args.load_stats and os.path.exists(of):
            N, F = pc.load(of)
        else:
            N, F = compute_bw_stats.compute_bw_stats(descr, ubm, None,
                                                     args.nbest)
            pc.dump(of, [N, F], verbose=False)
        if i == 0:
            print N.shape, F.shape
        progress.update(i + 1)
        return N.reshape(1, -1), F.reshape(1, -1)

    progress.start()
    if args.parallel:
        Ns, Fs = zip(
            *pc.parmap(extract, range(len(files)), nprocs=args.nprocs))
    else:
        Ns, Fs = zip(*map(extract, range(len(files))))
    progress.finish()

    Ns = np.concatenate(Ns, axis=0)
    Fs = np.concatenate(Fs, axis=0)
    print 'train tv from {} stats'.format(len(Ns))
    tv = train_tv_space(Ns, Fs, ubm, args.tv_dim, args.tv_niter, args.parallel,
                        args.nprocs)

    folder = os.path.join(args.outputfolder, 'tv.pkl.gz')
    pc.dump(folder, tv)

    return folder

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Clustering - Index')
    parser = pc.commonArguments(parser)
    parser = addArguments(parser)
    args = parser.parse_args()
    np.random.seed(42)

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    print args.max_descriptors

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                concat=True)
    print 'n-files:', len(files)

    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)

    desc_files, _ = pc.getFiles(args.df, args.ds, args.labelfile, concat=True)

    kmeans = pc.load(args.cluster)
    means = kmeans.means_

    print files[0], desc_files[0]
    dummy_desc = pc.loadDescriptors(files[0])
    dummy_desc2 = pc.loadDescriptors(desc_files[0])
    assert (dummy_desc.shape[0] == dummy_desc2.shape[0])
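
The compute_bw_stats call above gathers per-file zeroth- and first-order Baum-Welch statistics against the UBM. A minimal sketch of what such statistics look like, assuming a fitted scikit-learn GaussianMixture in place of the project's UBM loader (bw_stats and the toy data below are illustrative, not project code):

import numpy as np
from sklearn.mixture import GaussianMixture

def bw_stats(descriptors, gmm):
    # posterior probabilities: n_samples x n_components
    gamma = gmm.predict_proba(descriptors)
    N = gamma.sum(axis=0)          # zeroth order: soft counts per component
    F = gamma.T.dot(descriptors)   # first order: n_components x feature_dim
    return N, F.ravel()

rng = np.random.RandomState(42)
ubm = GaussianMixture(n_components=16, covariance_type='diag')
ubm.fit(rng.randn(1000, 64))
N, F = bw_stats(rng.randn(200, 64), ubm)
print N.shape, F.shape             # (16,), (1024,)
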
Example #3

def runNN(descriptors, labels, parallel, nprocs):
    """
    compute nearest neighbor from specific descriptors, given labels
    """

    distance_method = {"cosine": 'cosine'}
    ret_matrix = None
    for name, method in distance_method.iteritems():
        dist_matrix = computeDistances(descriptors, method, parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Evaluate stuff")
    parser = pc.commonArguments(parser)
    args = parser.parse_args()

    descr_files, labels = pc.getFiles(args.inputfolder,
                                      args.suffix,
                                      args.labelfile,
                                      exact=True)
    descriptors = pc.loadDescriptors(descr_files)

    ret_matrix = runNN(descriptors, labels, args.parallel, args.nprocs)
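
computeDistances is project code; as a rough, assumed equivalent for the 'cosine' case, the pairwise distance matrix could be computed with scikit-learn, with the diagonal masked so a sample never matches itself:

import numpy as np
from sklearn.metrics import pairwise_distances

def cosine_dist_matrix(descriptors):
    dist = pairwise_distances(descriptors, metric='cosine')
    np.fill_diagonal(dist, np.inf)   # exclude self-matches
    return dist

X = np.random.rand(10, 32)
nn = cosine_dist_matrix(X).argmin(axis=1)   # nearest neighbor per sample
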
Example #4
def run(args):
    print '> compute LCS'
    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                exact=args.exact)
    if len(args.max_descriptors) == 0:
        descriptors, index_list = pc.loadDescriptors(files,
                                                     rand=True,
                                                     return_index_list=1)
    else:
        descriptors, index_list = pc.loadDescriptors(
            files,
            max_descs=args.lcs_max_descriptors,
            max_descs_per_file=max(int(args.lcs_max_descriptors / len(files)), 1),
            rand=True,
            return_index_list=1)
        print 'descriptors.shape', descriptors.shape
#        #if not args.inputfolders:
#        cur_data, index_list = pc.loadDescriptors(files,
#                                                  max_descs=args.max_descriptors[0]\
#                                                  if args.max_descriptors\
#                                                  else 0,
#                                                  return_index_list=True)

    # per-descriptor labels:
    if len(index_list) - 1 != len(labels):
        raise ValueError('{} != {} + 1'.format(len(index_list), len(labels)))
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)
    desc_labels = np.zeros(len(descriptors), dtype=np.uint32)
    for r in xrange(len(labels)):
        desc_labels[index_list[r]:index_list[r + 1]] = labels[r]

    prep = preprocess.Preprocess(args)

    ubm = ubm_adaption.loadGMM(args.load_ubm)
    if not args.no_assignment:
        assignments = encoding.getAssignment(ubm.means_, descriptors)
    lcs = []
    descr = []
    # Note: we could also compute the LCS afterwards using 'multipca' option
    # of preprocess...
    for i in range(len(ubm.means_)):
        if args.no_assignment:
            diff = descriptors - ubm.means_[i]
        else:
            for_lcs = descriptors[assignments[:, i] > 0]
            diff = for_lcs - ubm.means_[i]
        if args.resnorm:
            diff = preprocessing.normalize(diff, norm='l2', copy=False)
        if not args.global_cs:
            prep.fit(diff, desc_labels[assignments[:, i] > 0])
            lcs.append(copy.deepcopy(prep.pca))
            prep.pca = None
        else:
            descr.append(diff)

    if args.global_cs:
        print '> compute global lcs'
        diff = np.concatenate(descr, axis=1)
        print '... from descr.shape', diff.shape
        prep.fit(diff, desc_labels)
        print '< compute global lcs'
        lcs = copy.deepcopy(prep.pca)
        prep.pca = None
    folder = os.path.join(args.outputfolder, 'lcs.pkl.gz')
    pc.dump(folder, lcs)
    return folder
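
The loop above fits one transform per GMM component on the residuals of the descriptors assigned to it. A condensed sketch of that idea with hard assignments and plain scikit-learn PCA (all names here are illustrative; the project's Preprocess.fit may do more, e.g. supervised whitening):

import numpy as np
from sklearn.decomposition import PCA

def fit_local_pcas(descriptors, means, n_components=32):
    # hard-assign each descriptor to its closest mean
    d2 = ((descriptors[:, None, :] - means[None, :, :]) ** 2).sum(axis=2)
    assignments = d2.argmin(axis=1)
    pcas = []
    for i in range(len(means)):
        residuals = descriptors[assignments == i] - means[i]
        # guard against clusters smaller than the requested dimensionality
        n_comp = min(n_components, len(residuals), residuals.shape[1])
        pcas.append(PCA(n_components=n_comp).fit(residuals))
    return pcas
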
Example #5
    print "NN {:10} TOP-1: {:7}  mAP: {:12}".format(name, top1, mAP)
    
    return top1, mAP

def runNN(descriptors, labels, parallel, nprocs):
    """
    compute nearest neighbor from specific descriptors, given labels
    """

    distance_method = { "cosine": 'cosine' }
    ret_matrix = None
    for name, method in distance_method.iteritems():
        dist_matrix = computeDistances(descriptors, method,
                                       parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix 

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Evaluate stuff")
    parser = pc.commonArguments(parser)
    args = parser.parse_args()

    descr_files, labels = pc.getFiles(args.inputfolder,
                                      args.suffix,
                                      args.labelfile,
                                      exact=True)
    descriptors = pc.loadDescriptors(descr_files)

    ret_matrix = runNN(descriptors, labels, args.parallel, args.nprocs)
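
computeStats prints the TOP-1 and mAP values formatted above. A self-contained sketch of a leave-one-out evaluation from a distance matrix, assuming smaller distance means more similar (this mirrors, but is not, the project's implementation):

import numpy as np

def top1_and_map(dist_matrix, labels):
    labels = np.asarray(labels)
    n = len(labels)
    hits, aps = 0, []
    for i in range(n):
        order = np.argsort(dist_matrix[i])
        order = order[order != i]            # drop the query itself
        rel = labels[order] == labels[i]
        if rel[0]:
            hits += 1                        # nearest neighbor is correct
        if rel.any():
            prec = np.cumsum(rel) / np.arange(1.0, n)
            aps.append((prec * rel).sum() / rel.sum())
    return hits / float(n), np.mean(aps)
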
Example #6
def run(args, prep=None, identifier=''):
    if prep is None:
        prep = preprocess.Preprocess()
    if args.dist_matrix:
        files, labels = pc.getFiles(args.inputfolder,
                                    args.suffix,
                                    labelfile=args.labelfile,
                                    exact=True)
        dist_matrix = np.loadtxt(args.dist_matrix,
                                 delimiter=',',
                                 ndmin=2,
                                 dtype=np.float64)
        stats_d1 = computeStats('cosine',
                                dist_matrix,
                                labels,
                                parallel=args.parallel,
                                distance=True,
                                nprocs=args.nprocs,
                                eval_method=args.eval_method)
        if args.outputfolder:
            write_stats(os.path.join(args.outputfolder, args.stats_filename),
                        stats_d1, args.identifier)
        if args.dist_matrix2:
            dist_matrix = np.loadtxt(args.dist_matrix2,
                                     delimiter=',',
                                     ndmin=2,
                                     dtype=np.float64)
            stats_d2 = computeStats('cosine',
                                    dist_matrix,
                                    labels,
                                    parallel=args.parallel,
                                    distance=True,
                                    nprocs=args.nprocs,
                                    eval_method=args.eval_method)
            # make p-test
            from scipy import stats
            s1 = stats_d1['ap']
            s2 = stats_d2['ap']
            T, p = stats.wilcoxon(s1, s2)
            print 'wilcox T:', T
            print 'wilcox p:', p

            k, p = stats.normaltest(s1)
            print 'normaltest1 k:', k
            print 'normaltest1 p:', p
            k, p = stats.normaltest(s2)
            print 'normaltest2 k:', k
            print 'normaltest2 p:', p

            f, p = stats.f_oneway(s1, s2)
            print 'anova:', f, p

            h, p = stats.kruskal(s1, s2)
            print 'kruskal h, p', h, p

            print 'pearson:', stats.pearsonr(s1, s2)

            t, p = stats.ttest_ind(s1, s2)
            print 't-test:', t, p

            t, p = stats.ttest_ind(s1, s2, equal_var=False)
            print 't-test (false):', t, p

            def exact_mc_perm_test(xs, ys, nmc):
                n, k = len(xs), 0.0
                diff = np.abs(np.mean(xs) - np.mean(ys))
                zs = np.concatenate([xs, ys])
                for j in range(nmc):
                    np.random.shuffle(zs)
                    k += diff < np.abs(np.mean(zs[:n]) - np.mean(zs[n:]))
                return k / float(nmc)

            print 'try exact monte carlo permutation test'
            p = exact_mc_perm_test(stats_d1['ap'], stats_d2['ap'], 10000)
            print 'exact mc:', p

            def permutation_resampling(case, control, num_samples, statistic):
                """Returns p-value that statistic for case is different
                from statistic for control."""

                observed_diff = abs(statistic(case) - statistic(control))
                num_case = len(case)

                combined = np.concatenate([case, control])
                diffs = []
                for i in range(num_samples):
                    xs = np.random.permutation(combined)
                    diff = statistic(xs[:num_case]) - statistic(xs[num_case:])
                    diffs.append(diff)

                # two-sided p-value: fraction of permuted differences at
                # least as extreme as the observed one
                diffs = np.asarray(diffs)
                pval = (np.sum(diffs > observed_diff) +
                        np.sum(diffs < -observed_diff)) / float(num_samples)
                return pval

            print 'permutation test', permutation_resampling(
                s1, s2, 10000, np.mean)

        sys.exit(0)
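
A quick standalone sanity check of the paired tests used above on synthetic data (illustrative only; s1/s2 in the script are per-query average precisions):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
s_a = rng.randn(40)
s_b = s_a + 0.5 + 0.1 * rng.randn(40)   # systematically shifted pair
T, p = stats.wilcoxon(s_a, s_b)         # paired, non-parametric
print 'wilcoxon T, p:', T, p            # expect a very small p here
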
Example #7
                    xs = np.random.permutation(combined)
                    diff = statistic(xs[:num_case]) - statistic(xs[num_case:])
                    diffs.append(diff)

                # two-sided p-value over the permuted differences
                diffs = np.asarray(diffs)
                pval = (np.sum(diffs > observed_diff) +
                        np.sum(diffs < -observed_diff)) / float(num_samples)
                return pval

            print 'permutation test', permutation_resampling(
                s1, s2, 10000, np.mean)

        sys.exit(0)

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile,
                                inputfolders_suffix=args.inputfolders_suffix)
    if args.fusion == 'early':
        descriptors = [pc.loadDescriptors(files)]
        print 'loaded descriptor(s), shape:', descriptors[0].shape
    else:
        raise ValueError('currently no other fusion than <early> allowed!')

    # concatenate all possible features
#    if len(args.inputfolder) > 1 or args.inputfolders_suffix != '':
#
#        descriptors, labels, all_files = pc.loadAllDescriptors(args.inputfolder,
#                                                    args.inputfolders_suffix,
#                                                    args.suffix, args.labelfile,
#                                                    1 if args.fusion == 'early' else None)
# TODO: this is illogical: should be args.labelfile_gallery ...
def runHelper(prep, args):

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                labelfile=args.labelfile, exact=args.exact,
                                inputfolders_suffix=args.inputfolders_suffix,
                                max_files=args.max_files)
    print 'process {} files'.format(len(files))
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]


    if args.load_all_features:
        cur_data, index_list = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0] if args.max_descriptors else 0,
            return_index_list=True)

        # per descriptor labels:
        if len(index_list)-1 != len(labels):
            raise ValueError('{} != {} + 1'.format(len(index_list),
                                                   len(labels)))
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(labels)
        desc_labels = np.zeros(len(cur_data), dtype=np.uint32)
        for r in xrange(len(labels)):
            desc_labels[index_list[r]:index_list[r+1]] = labels[r]

        print 'loaded all', cur_data.shape
        if 'transform' in args.mode and args.mode != 'fit_transform':
            print 'first feature before:', cur_data[0]
            print 'dimension before:', cur_data.shape[1], cur_data.dtype
            cur_data = prep.transform(cur_data)
            print 'first feature after:', cur_data[0]
            print 'dimension after:', cur_data.shape[1], cur_data.dtype

        if 'fit' in args.mode:
            if 'transform' in args.mode and args.strip_aug:
                prep.strip_aug = False
            prep.fit(cur_data, labels=desc_labels)

            if args.mode == 'fit_transform':
                cur_data = prep.transform(cur_data)

    else:
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(files))

        if any(isinstance(f, tuple) for f in files):
            files1 = [f for f in zip(*files)[0]]
            cp = os.path.commonprefix(files1)
        else:
            cp = os.path.commonprefix(files)

        def proj(i):
            # n_samples x n_features
            if (not isinstance(args.inputfolder, basestring) and
                    len(args.inputfolder) > 1) or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return

            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype

                cur_data = prep.transform(cur_data)

                if i == 0:
                    print 'after:'
                    print cur_data[0,0:min(128,cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring)\
                    else files[i][0]

            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz','')
            else:
                name = os.path.splitext(fname)[0]

            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                                        os.path.dirname(name)), silent=True)

            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
#            print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)

        progress.start()
        # FIXME: np.dot (e.g. used for (R)PCA) doesnt work in parallel atm
#        if args.parallel:
#            pc.parmap(proj, range(len(files)), args.nprocs)
#        else:
        map(proj, range(len(files)))
        progress.finish()

    prep.save_trafos(args.outputfolder)
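
prep.partial_fit/prep.transform implement a two-pass scheme: accumulate statistics file by file, then project each file. A minimal sketch of the same pattern, assuming scikit-learn's IncrementalPCA as a stand-in for the project's Preprocess object:

import numpy as np
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=16)
chunks = [np.random.rand(200, 64) for _ in range(5)]   # one chunk per file
for chunk in chunks:           # pass 1: fit incrementally
    ipca.partial_fit(chunk)
reduced = [ipca.transform(chunk) for chunk in chunks]  # pass 2: project
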
Example #9
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()

    if not args.labelfile or not args.inputfolder \
       or not args.outputfolder:
        print('WARNING: no labelfile or no inputfolder'
              ' or no outputfolder specified')

    print 'accumulate features:', args.accumulate

    if args.outputfolder and not os.path.exists(args.outputfolder):
        print "outputfolder doesn't exist -> create"
        pc.mkdir_p(args.outputfolder)

    if args.load_scores:
        print 'try to load computed encodings'


    #####
    # UBM / loading
    print 'load gmm from', args.load_ubm
    ubm_gmm = None
    if args.load_ubm:
        ubm_gmm = loadGMM(args.load_ubm, args.lib)

    #####
    # Enrollment
    # now for each feature-set adapt a gmm
    #####
    if args.labelfile is None:
        print 'WARNING: no label-file'
    if args.concat_later:
        args.concat = True
    if args.concat:
        groups = None

        if args.group_word:
            descriptor_files = pc.getFilesGrouped(args.inputfolder, args.suffix)
            labels = None
        else:
            descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                                   args.labelfile, exact=False,
                                                   concat=True)
            print 'labels:', labels[0]
            if len(descriptor_files) != len(labels):
                raise ValueError('len(descriptor_files) {} != '
                                 'len(labels) {}'.format(len(descriptor_files),
                                                         len(labels)))
        print 'num descr-files of first:', len(descriptor_files[0])

    else:
        descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                               args.labelfile)
    if args.maskfolder:
        maskfiles = pc.getMaskFiles(descriptor_files, args.suffix,
                                    args.maskfolder, args.masksuffix)
    if len(descriptor_files) == 0:
        print 'no descriptor_files'
        sys.exit(1)
    if labels:
        num_scribes = len(list(set(labels)))
    else:
        num_scribes = 'unknown'

    num_descr = len(descriptor_files)
    print 'number of classes:', num_scribes
    print 'number of descriptor_files:', num_descr
    print 'adapt training-features to create individual scribe-gmms (or load saved ones)'
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]
    progress = progressbar.ProgressBar(widgets=widgets,
                                       maxval=len(descriptor_files))

    if 'supervector' in args.encoding:
        identifier = '_sv'
    elif 'fisher' in args.encoding:
        identifier = '_fv'
    else:
        identifier = '_' + args.encoding

    identifier += '_' + args.update
    if len(args.normalize_enc) > 0:
        identifier += '_' + '_'.join(args.normalize_enc)

    encoder = Encoding(args.encoding, ubm_gmm, parallel=False,
                       normalize=args.normalize_enc, update=args.update,
                       relevance=args.relevance, nbest=args.nbest,
                       ratio=args.ratio,
                       accumulate=args.accumulate,
                       nprocs=args.nprocs)

    if args.posteriors_dir:
        posterior_files, _ = pc.getFiles(args.posteriors_dir, args.posteriors_suffix,
                                         args.labelfile)
        print len(posterior_files), len(descriptor_files)
        assert(len(posterior_files) == len(descriptor_files))

    cp = os.path.commonprefix(descriptor_files)
    #print cp
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            fname = descriptor_files[i]
            if os.path.isdir(cp):
                base = os.path.relpath(fname, cp)
            else:
                base = fname

            if base.endswith('.pkl.gz'):
                base = base.replace('.pkl.gz', '')
            else:
                base = os.path.splitext(base)[0]

            if os.path.isdir(cp):
                folder = os.path.join(args.outputfolder,
                                      os.path.dirname(base))
                # print 'should create: {} + {}'.format(args.outputfolder, base)
                pc.mkdir_p(folder, silent=True)
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + ('_gmm.pkl.gz' if 'bob' not in args.lib else '_gmm_bob.hdf5')
        gmm = ubm_gmm

        scribe_gmm = None
        # load gmm if possible
        if args.load_gmm:
            gmm_file = os.path.join(args.load_gmm, gmm_name)
            scribe_gmm = load_gmm(gmm_file, args.lib)

        # load encoding
        if args.load_scores:
            if args.load_scores == 'outputfolder':
                load_f = args.outputfolder
            else:
                load_f = args.load_scores

            filepath = os.path.join(load_f, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc, None
#            else:
#                print ('WARNING: encoding {} doesnt exist, compute'
#                        'it'.format(filepath ))


        if args.concat_later:
            enc = []
            for k in range(len(descriptor_files[i])):
                # load data and preprocess
                features = pc.loadDescriptors(
                    descriptor_files[i][k],
                    min_descs_per_file=args.min_descs,
                    show_progress=(False if args.concat else True))
                if features is None:
                    print 'features==None'
                    continue
                features = prep.transform(features)

                enc_ = encoder.encode(features)
                enc.append(enc_)
            enc = np.concatenate(enc, axis=0)

        else:
            # load data and preprocess
            features = pc.loadDescriptors(descriptor_files[i],
                                          min_descs_per_file=args.min_descs,
                                          show_progress=(False if args.concat
                                                         else True))
            posteriors = None
            if args.posteriors_dir:
                posteriors = pc.loadDescriptors(posterior_files[i])
                assert(len(posteriors) == len(features))
            if not isinstance(features, np.ndarray) and not features:
                print 'features==None?'
                progress.update(i+1)
                return 0.0, None

            if i == 0:
                print '0-shape:',features.shape
            features = prep.transform(features)
            if i == 0:
                print '0-shape (possibly after pca):',features.shape

            if args.maskfolder:
                sample_weights = pc.loadDescriptors(maskfiles[i])
            else:
                sample_weights = None
            enc, scribe_gmm = encoder.encode(features, return_gmm=True,
                                             sample_weights=sample_weights,
                                             posteriors=posteriors,
                                             verbose=True if i == 0 else False)
            if i == 0:
                print '0-enc-shape', enc.shape
                if isinstance(sample_weights, np.ndarray):
                    print 'sample-weights shape:', sample_weights.shape
            # write
            if args.save_gmm:
                scribe_gmm_filename = os.path.join(args.outputfolder, gmm_name)
                if 'bob' in args.lib:
                    scribe_gmm.save(bob.io.HDF5File(scribe_gmm_filename, 'w'))
                else:
                    with gzip.open(scribe_gmm_filename, 'wb') as f:
                        cPickle.dump(scribe_gmm, f, -1)
                pc.verboseprint('wrote', scribe_gmm_filename)
                progress.update(i+1)

        if args.pq and args.load_pq:
            enc = prep.compress(enc, aug=args.aug)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier +
                                ('_pq' if args.pq else '') + '.pkl.gz')
        with gzip.open(filepath, 'wb') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i+1)
        if 'nothing' in args.evaluate:
            return None, None
        return enc, scribe_gmm

    progress.start()
    if args.parallel:
        all_enc, all_gmms = zip(*pc.parmap(encode, range(num_descr),
                                           args.nprocs, size=num_descr))
    else:
        all_enc, all_gmms = zip(*map(encode, range(num_descr)))
    progress.finish()
    if 'nothing' in args.evaluate:
        print 'nothing to evaluate, exit now'
        return

    print 'got {} encodings'.format(len(all_enc))

    all_enc = np.concatenate(all_enc, axis=0) #.astype(np.float32)

    print 'all_enc.shape', all_enc.shape

    print 'Evaluation:'

    stats = None
    ret_matrix = None

    for eval_method in args.evaluate:
        ret_matrix, stats = evaluate.runNN(all_enc, labels, distance=True,
                                           histogram=False,
                                           eval_method=eval_method,
                                           parallel=args.parallel,
                                           nprocs=args.nprocs)

        if ret_matrix is None or not isinstance(ret_matrix, np.ndarray):
            print 'WARNING: ret_matrix is None or not an instance of np.ndarray'
        else:
            fpath = os.path.join(args.outputfolder, 'dist' + identifier
                                 + '_' + eval_method + '.csv')
            np.savetxt(fpath, ret_matrix, delimiter=',')
            print 'saved', fpath

    return stats
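
For the 'supervector' encoding referenced above, the classic recipe is MAP adaptation of the UBM means towards a document's descriptors, stacked into one long vector. A simplified sketch under that assumption (the project's Encoding class supports more update modes and normalizations):

import numpy as np

def mean_supervector(descriptors, gmm, relevance=16.0):
    # gmm: a fitted sklearn GaussianMixture acting as the UBM
    gamma = gmm.predict_proba(descriptors)      # n_samples x n_components
    N = gamma.sum(axis=0)                       # soft counts
    F = gamma.T.dot(descriptors)                # weighted sums
    alpha = (N / (N + relevance))[:, None]      # adaptation factor per component
    adapted = alpha * (F / np.maximum(N, 1e-10)[:, None]) + \
              (1.0 - alpha) * gmm.means_
    return adapted.ravel()

# usage with a toy UBM (illustrative):
# ubm = GaussianMixture(16, covariance_type='diag').fit(train_descriptors)
# sv = mean_supervector(doc_descriptors, ubm)
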
Example #10
            ' or no outputfolder specified')
    if args.outputfolder and not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)
    if not args.load_ubm:
        raise argparse.ArgumentTypeError('no gmm to load')

    #####
    # UBM-creation / loading
    print 'load gmm from', args.load_ubm
    ubm_gmm = loadUBM(args.load_ubm)

    #####
    # Enrollment
    # now for each feature-set adapt a gmm
    #####
    descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                           args.labelfile)

    if len(descriptor_files) == 0:
        print 'no descriptor_files'
        sys.exit(1)
    elif labels:
        num_scribes = len(list(set(labels)))
    else:
        num_scribes = 'unknown'

    num_descr = len(descriptor_files)
    print 'number of classes:', num_scribes
    print 'number of descriptor_files:', num_descr
    print 'adapt training-features to create individual scribe-gmms (or load saved ones)'
    widgets = [
        progressbar.Percentage(), ' ',
Example #11
                                         ' or no outputfolder specified')
    if args.outputfolder and not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)
    if not args.load_ubm:
        raise argparse.ArgumentTypeError('no gmm to load')

    #####
    # UBM-creation / loading
    print 'load gmm from', args.load_ubm
    ubm_gmm = loadUBM(args.load_ubm)

    #####
    # Enrollment
    # now for each feature-set adapt a gmm
    #####
    descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                           args.labelfile)

    if len(descriptor_files) == 0:
        print 'no descriptor_files'
        sys.exit(1)
    elif labels:
        num_scribes = len(list(set(labels)))
    else:
        num_scribes = 'unknown'

    num_descr = len(descriptor_files)
    print 'number of classes:', num_scribes
    print 'number of descriptor_files:', num_descr
    print 'adapt training-features to create individual scribe-gmms (or load saved ones)'
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]
def run(args, prep, write_stats=False):
    # create (or load) for each file an exemplar classifier
    # using the rest of the files as background class
    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile)
    # all labels should differ!
    assert (len(set(labels)) == len(labels))

    # if we use classifiers as attributes then we need
    # background-classifiers independent from the training set
    if args.attribute:
        assert args.bi

    # additional background descriptors
    if len(args.bi) > 0:
        assert (len(args.bi) == len(args.bl))
        bg_files, bg_labels = pc.getFiles(args.bi,
                                          args.suffix,
                                          labelfile=args.bl,
                                          concat=True)
        #        bg_files = []
        #        bg_labels = []
        #        for e,bi in enumerate(args.bi):
        #            tmp_bg_files, tmp_bg_labels = pc.getFiles(bi, args.suffix,
        #                                      labelfile=args.bl[e])
        # Don't need this assert since the background labels are allowed
        # to appear multiple times
        #            assert( len(list(set(tmp_bg_labels))) == len(tmp_bg_labels) )
        #            bg_files.extend(tmp_bg_files)
        #            bg_labels.extend(tmp_bg_labels)

        #        assert( len(list(set(bg_labels+labels))) == len(bg_labels+labels) )
        assert (len(set(labels).intersection(set(bg_labels))) == 0)

    ex_cls = []
    if args.load_ex_cls:
        for f in files:
            ex_cls.append(pc.load(f))
    else:
        if (not args.scale and args.load_trafo != 'scaler') and \
           ('svm' in args.clsname or args.clsname == 'sgd'):
            print 'WARNING: svm or sgd chosen but not --scale!'

        all_cls = args.func(args)
        if not all_cls:
            raise ValueError('no classifier given')
        the_cls = all_cls[0]

        print 'load:', args.inputfolder
        descr = pc.loadDescriptors(files)
        print 'shape:', descr.shape
        if len(args.bi) > 0:
            print 'load descriptors of: ' + ','.join(args.bi)
            descr_bg = pc.loadDescriptors(bg_files)
            print 'shape:', descr_bg.shape
            if not args.attribute:
                descr = np.concatenate([descr, descr_bg], axis=0)
                print 'concat shape:', descr.shape

        print 'pre descr[0]', descr[0]
        print 'fit-transform'
        descr = prep.fit_transform(descr)
        print 'post descr[0]', descr[0]
        print 'possible new shape:', descr.shape
        prep.save_trafos(args.outputfolder)

        if args.attribute:
            descr_bg = prep.transform(descr_bg)
            print 'compute attribute space, dim=', len(descr_bg)
            ex_cls_bg = computeExCls(descr_bg,
                                     the_cls,
                                     len(descr_bg),
                                     args.outputfolder,
                                     bg_labels,
                                     '_attr.pkl.gz',
                                     parallel=args.parallel)
            descr = exemplar_cls.predictExemplarCls(descr, ex_cls_bg)
            # platt calibration
            #            ab_list = computeAB(descr_bg, ex_cls_bg, bg_labels)
            #            descr = convertToProbs(descr, ab_list)
            print 'new descr-shape:', descr.shape

        ex_cls = computeExCls(descr,
                              the_cls,
                              len(files),
                              args.outputfolder,
                              labels,
                              parallel=args.parallel)

        # platt calibration


#        ab_list = computeAB(descr, ex_cls, labels)

    print 'load test:', args.pi
    files_probe, labels_probe = pc.getFiles(args.pi,
                                            args.suffix,
                                            labelfile=args.pl)

    print 'predict now'
    scores = predict(files_probe, ex_cls, prep, parallel=args.parallel)
    # this is our scores-matrix
    scores_mat = np.concatenate(scores, axis=0)
    stats = evaluate.computeStats('sum/max',
                                  scores_mat,
                                  labels_probe,
                                  labels,
                                  distance=False,
                                  parallel=args.parallel)
    if write_stats:
        evaluate.write_stats(os.path.join(args.outputfolder, 'stats.txt'),
                             stats)
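
computeExCls trains one exemplar classifier per file against all others as negatives. An assumed minimal equivalent using a linear SVM (the project version additionally writes each classifier to disk and supports parallelism):

import numpy as np
from sklearn.svm import LinearSVC

def exemplar_classifiers(descr, C=1.0):
    cls = []
    for i in range(len(descr)):
        y = np.zeros(len(descr), dtype=int)
        y[i] = 1                     # a single positive exemplar
        cls.append(LinearSVC(C=C, class_weight='balanced').fit(descr, y))
    return cls
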
Example #13
                            help='detector type')
    feat_group.add_argument('--feature', '--descriptor',
                            default='SIFT',
                            help='feature type')
    return parser

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Some Feature Extractionmethods")
    parser = pc.commonArguments(parser)
    parser = parseArgs(parser)
    args = parser.parse_args()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)
        
    files, _ = pc.getFiles(args.inputfolder, args.suffix, args.labelfile)
    if not files or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    all_features = []

    fe = FeatureEx(args.detector, args.feature)

    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]
    progress = progressbar.ProgressBar(widgets=widgets,
                                       maxval=len(files))

    progress.start()
    def compute(i):
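
The snippet is cut off at compute(i); FeatureEx is project code. Presumably it wraps OpenCV keypoint detection and description, roughly along these lines (a sketch; cv2.SIFT_create requires OpenCV >= 4.4, older builds use cv2.xfeatures2d.SIFT_create):

import cv2

def extract_sift(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(img, None)
    return keypoints, descriptors   # descriptors: n_keypoints x 128
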
Example #14
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()
    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                exact=args.exact,
                                max_files=args.max_files)
    if files is None or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    maskfiles = pc.getMaskFiles(files, args.suffix, args.maskfolder,
                                args.masksuffix)
    if len(args.max_descriptors) == 0:
        descriptors, rand_indices = pc.loadDescriptors(
            files, rand=True, return_random_indices=True)
    else:
        max_descs_per_file = int(args.max_descriptors[0] / float(len(files)))
        max_descs_per_file = max(max_descs_per_file, 1)
        descriptors, rand_indices = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0],
            max_descs_per_file=max_descs_per_file,
            rand=True,
            maskfiles=maskfiles,
            return_random_indices=True)

    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    if args.method == 'posteriors':
        posteriors_files, _ = pc.getFiles(args.posteriors_dir,
                                          args.posteriors_suffix,
                                          labelfile=args.labelfile,
                                          exact=args.exact,
                                          max_files=args.max_files)
        assert (len(posteriors_files) == len(files))
        indices = []

        widgets = [
            progressbar.Percentage(), ' ',
            progressbar.Bar(), ' ',
            progressbar.ETA()
        ]
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(posteriors_files))
        progress.start()
        for e, f in enumerate(posteriors_files):
            posteriors = pc.loadDescriptors(f)
            posteriors = posteriors[rand_indices[e]]
            cluster_idx = posteriors.argmax(axis=1)
            indices.append(cluster_idx)
            progress.update(e + 1)
        progress.finish()

        indices = np.concatenate(indices)
        assert (len(indices) == len(descriptors))
        means = recomputeMeans(descriptors, indices)
        vocabulary = cluster.KMeans(means.shape[0])  # dummy
        vocabulary.means_ = means
        vocabulary.type_ = 'kmeans'
    else:
        vocabulary = computeVocabulary(descriptors, args.method,
                                       args.num_clusters, args.iterations,
                                       args.gmm_update, args.lib,
                                       args.covar_type, args.nprocs)

    # TODO: rewrite to be more generic
    if 'sparse' in args.method and 'gmm' in args.method:
        gmm = mixture.GMM(args.num_clusters,
                          n_iter=args.iterations,
                          params=args.gmm_update,
                          init_params='wc')
        gmm.means_ = vocabulary.reshape(args.num_clusters, -1)
        gmm.fit(descriptors)
        vocabulary = gmm

    if args.predict:
        pred = vocabulary.predict(descriptors)
        pred_prob = None
        if 'predict_proba' in dir(vocabulary):
            pred_prob = vocabulary.predict_proba(descriptors)
        for i, f in enumerate(files):
            if pred_prob is not None:
                print '{}\t[{}], ([{}])'.format(os.path.basename(f), pred[i],
                                                pred_prob[i])
            else:
                print '{}\t[{}]'.format(os.path.basename(f), pred[i])

    # save gmm
    voc_filepath = os.path.join(
        args.outputfolder,
        (args.vocabulary_filename
         if args.vocabulary_filename is not None else args.method) + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
    print 'saved vocabulary at', voc_filepath

    if args.method == 'gmm':
        try:
            aic = vocabulary.aic(descriptors)
            print 'aic:', aic
            with open(os.path.join(args.outputfolder, 'aic.txt'), 'a') as f:
                f.write('{}\n'.format(aic))
        except Exception as e:
            print "couldn't compute aic, error: {}".format(e)

    return os.path.abspath(voc_filepath)
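
computeVocabulary is project code; for method='kmeans' an assumed minimal equivalent would be the following, mirroring the means_ attribute this codebase reads elsewhere (e.g. kmeans.means_ in Example #1) and the type_ tag set on the dummy vocabulary above:

import numpy as np
from sklearn.cluster import KMeans

def kmeans_vocabulary(descriptors, num_clusters, iterations=100):
    km = KMeans(n_clusters=num_clusters, max_iter=iterations).fit(descriptors)
    km.means_ = km.cluster_centers_   # alias read elsewhere in this codebase
    km.type_ = 'kmeans'               # matches the dummy vocabulary above
    return km
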
Example #15
    parser = addArguments(parser)
    args = parser.parse_args()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    if not args.scale and not args.load_scaler and\
       ('svm' in args.clsname or args.clsname == 'sgd'):
        print 'WARNING: svm or sgd chosen but not --scale!'

    all_cls = args.func(args)
    if not all_cls:
        print 'no classifier given'

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile)
    files = np.array(files)
    labels = np.array(labels)

    # these are our background / negative training files
    b_files, b_labels = pc.getFiles(args.bi,
                                    args.bs if args.bs else args.suffix,
                                    labelfile=args.bl)

    # let's first test shapes
    test_f = pc.loadDescriptors(files[0])
    b_test_f = pc.loadDescriptors(b_files[0])
    assert (test_f.shape[1] == b_test_f.shape[1])
    print 'descriptor-dimension:', test_f.shape[1]
Example #16
                            default='SIFT',\
                            help='feature type')
    return parser


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Some Feature Extractionmethods")
    parser = pc.commonArguments(parser)
    parser = parseArgs(parser)
    args = parser.parse_args()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, _ = pc.getFiles(args.inputfolder, args.suffix, args.labelfile)
    if not files or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    all_features = []

    fe = FeatureEx(args.detector, args.feature)

    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files))