def compute_statistics(src_cfg, **kwargs):
    """Compute sufficient statistics for the bag-of-words or Fisher vector model.

    Parameters
    ----------
    src_cfg: str
        Dataset configuration name, forwarded to `Dataset`.

    Keyword arguments (all optional)
    --------------------------------
    ip_type: str, default "dense5.track15mbh"
        Interest-point / descriptor type passed to `Dataset`.
    suffix: str, default ""
        Suffix passed to `Dataset`.
    nr_clusters: int, default 128
        GMM / vocabulary size; also stored on `dataset.VOC_SIZE`.
    model_type: str, default "fv"
        Model kind given to `Model` (e.g. Fisher vectors).
    worker_type: str, default "normal"
        "normal" or "per_slice"; selects the per-sample worker function.
    outfilename: str, default "stats.tmp"
        Output map name (overridden when `spm` is given).
    spm: tuple of 3 ints
        Spatial-pyramid grid; switches to the spatial-pyramid worker.
    pca, gmm:
        Pre-loaded models; when absent they are loaded from `dataset.FEAT_DIR`.
    nr_processes: int, default `multiprocessing.cpu_count()`
        Number of worker processes; 1 runs in-process (easier to debug).

    Remaining keyword arguments are forwarded verbatim to the worker.

    Raises
    ------
    ValueError
        If `worker_type` is unrecognized and no `spm` worker applies.
    """
    # Default parameters.
    ip_type = kwargs.get("ip_type", "dense5.track15mbh")
    suffix = kwargs.get("suffix", "")
    dataset = Dataset(src_cfg, ip_type=ip_type, suffix=suffix)

    nr_clusters = kwargs.get("nr_clusters", 128)
    dataset.VOC_SIZE = nr_clusters

    model_type = kwargs.get("model_type", "fv")
    worker_type = kwargs.get("worker_type", "normal")
    outfilename = kwargs.get("outfilename", "stats.tmp")

    # Select the per-sample worker. A spatial-pyramid request overrides the
    # worker_type choice, matching the original precedence.
    worker = None
    if worker_type == "normal":
        worker = compute_statistics_from_video_worker
    elif worker_type == "per_slice":
        from per_slice.compute_sstats_worker import compute_statistics_worker
        worker = compute_statistics_worker

    if "spm" in kwargs:  # `dict.has_key` was removed in Python 3.
        from spatial_pyramids import compute_statistics_worker
        worker = compute_statistics_worker
        outfilename = "stats.tmp_spm%d%d%d" % kwargs.get("spm")

    if worker is None:
        # Previously this fell through to a NameError further down.
        raise ValueError("Unknown worker_type %r." % worker_type)

    # Load PCA / GMM only when the caller did not supply them (the old
    # `kwargs.get(key, load_...())` form always hit the disk).
    if "pca" in kwargs:
        pca = kwargs["pca"]
    else:
        fn_pca = os.path.join(dataset.FEAT_DIR, "pca", "pca_64.pkl")
        pca = load_pca(fn_pca)

    if "gmm" in kwargs:
        gmm = kwargs["gmm"]
    else:
        fn_gmm = os.path.join(dataset.FEAT_DIR, "gmm", "gmm_%d" % nr_clusters)
        gmm = load_gmm(fn_gmm)

    descs_to_sstats = Model(model_type, gmm).descs_to_sstats
    nr_processes = kwargs.get("nr_processes", multiprocessing.cpu_count())

    # Statistics are computed for the train and test splits together.
    train_samples, train_labels = dataset.get_data("train")
    test_samples, test_labels = dataset.get_data("test")
    samples, labels = get_tupled_data(
        train_samples + test_samples, train_labels + test_labels)

    sstats_out = SstatsMap(
        os.path.join(
            dataset.FEAT_DIR, "statistics_k_%d" % nr_clusters, outfilename))

    if nr_processes > 1:
        # NOTE(review): `kwargs` may still contain "pca"/"gmm" keys that
        # collide with the positional args below if the worker names its
        # parameters the same — confirm against the worker signatures.
        processes = []
        nr_samples_per_process = len(samples) // nr_processes + 1
        for ii in range(nr_processes):
            begin = ii * nr_samples_per_process
            end = begin + nr_samples_per_process
            process = multiprocessing.Process(
                target=worker,
                args=(
                    dataset, samples[begin:end], labels[begin:end],
                    sstats_out, descs_to_sstats, pca, gmm),
                kwargs=kwargs)
            processes.append(process)
            process.start()
        # Wait for jobs to finish.
        for process in processes:
            process.join()
    else:
        # We use this special case, because it makes possible to debug.
        worker(
            dataset, samples, labels, sstats_out, descs_to_sstats,
            pca, gmm, **kwargs)