def parse_cifar(dataset, mode):
    features = []
    labels = []
    coarse_labels = []
    batch_names = []
    TARFILE, label_data, label_labels, label_coarse = get_data_params(dataset)
    datanames = get_datanames(dataset, mode)
    try:
        spinner = Spinner(prefix="Loading {} data...".format(mode))
        spinner.start()
        tf = tarfile.open(TARFILE)
        for dataname in datanames:
            ti = tf.getmember(dataname)
            data = unpickle(tf.extractfile(ti))
            features.append(data[label_data])
            labels.append(data[label_labels])
            batch_names.extend([dataname.split('/')[1]] * len(data[label_data]))
            if dataset == 'cifar100superclass':
                coarse_labels.append(data[label_coarse])
        # stack all batches, then reshape each flat 3072-byte row
        # into a 32x32 RGB image (NCHW -> NHWC, uint8)
        features = np.concatenate(features)
        features = features.reshape(features.shape[0], 3, 32, 32)
        features = features.transpose(0, 2, 3, 1).astype('uint8')
        labels = np.concatenate(labels)
        if dataset == 'cifar100superclass':
            coarse_labels = np.concatenate(coarse_labels)
        spinner.stop()
    except KeyboardInterrupt:
        spinner.stop()
        sys.exit(1)
    return features, labels, coarse_labels, batch_names
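
# The loop above relies on an `unpickle` helper that is not shown here.
# A minimal sketch, assuming the standard CIFAR batch encoding: each batch
# is a pickled dict, and Python 3 needs encoding='bytes' because the
# batches were written by Python 2.
import pickle

import six

def unpickle(fileobj):
    """Deserialize one CIFAR batch dict from an open file object."""
    if six.PY2:
        return pickle.load(fileobj)
    return pickle.load(fileobj, encoding='bytes')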
def load_hash_dict(self, load_path, use_cache, target_dir):
    if load_path and Path(load_path).exists() and use_cache:
        logger.debug("Load hash cache: {}".format(load_path))
        spinner = Spinner(prefix="Loading hash cache...")
        spinner.start()
        self.hash_dict = joblib.load(load_path)
        spinner.stop()
        is_update = self.update_hash_dict()
        return not is_update
    else:
        self.hash_dict = {}
        self.update_hash_dict()
        return False
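
# load_hash_dict returns True only when a valid cache was loaded and no
# update was needed, so callers can tell whether the cache file should be
# rewritten. A minimal save counterpart (hypothetical helper; `save_path`
# is an assumed parameter, but joblib.dump is the same real API used above):
import joblib

def save_hash_dict(self, save_path):
    """Persist the current hash dict so the next run can skip hashing."""
    joblib.dump(self.hash_dict, save_path, compress=True)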
def update_hash_dict(self):
    if self.num_proc is None:
        self.num_proc = cpu_count() - 1
    # diff the cached hashes against the files currently on disk
    current_files = set(self.image_filenames)
    cache_files = set(self.hash_dict)  # wrap in set() so the diff also works on Python 2
    lost_set = cache_files - current_files
    target_files = list(current_files - cache_files)
    if len(lost_set) + len(target_files) > 0:
        try:
            if len(self.hash_dict) == 0:
                spinner = Spinner(
                    prefix="Calculating image hashes (hash-bits={} num-proc={})..."
                    .format(self.hash_bits, self.num_proc))
            else:
                spinner = Spinner(
                    prefix="Updating image hashes (hash-bits={} num-proc={})..."
                    .format(self.hash_bits, self.num_proc))
            spinner.start()
            # drop cache entries whose files no longer exist
            for f in lost_set:
                del self.hash_dict[f]
            if six.PY2:
                from pathos.multiprocessing import ProcessPool as Pool
            elif six.PY3:
                from multiprocessing import Pool
            pool = Pool(self.num_proc)
            # hash only the files that are missing from the cache
            hashes = pool.map(self.gen_hash, target_files)
            pool.close()
            pool.join()
            for filename, hash_value in zip(target_files, hashes):
                self.hash_dict[filename] = hash_value
            spinner.stop()
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            spinner.stop()
            sys.exit(1)
        return True
    else:
        return False
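
# `gen_hash` itself is not shown. A plausible sketch, assuming perceptual
# hashing via the `imagehash` library (an assumption, not confirmed by this
# excerpt): hash_bits is the total bit count, so the hash side length is
# its square root (e.g. 64 bits -> an 8x8 hash).
import math

import imagehash
from PIL import Image

def gen_hash(self, filename):
    """Compute a perceptual hash for one image file."""
    hash_size = int(math.sqrt(self.hash_bits))
    with Image.open(filename) as img:
        return imagehash.phash(img, hash_size=hash_size)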
def load(self, load_path, use_cache, target_dir):
    if load_path and Path(load_path).exists() and use_cache:
        cache_mtime = self.check_mtime(load_path)
        target_mtime = self.check_latest_dir_mtime(target_dir)
        # reuse the cache only if it is newer than everything in target_dir
        if cache_mtime > target_mtime:
            logger.debug("Load hash cache: {}".format(load_path))
            spinner = Spinner(prefix="Loading hash cache...")
            spinner.start()
            self.cache = joblib.load(load_path)
            spinner.stop()
            return True
        else:
            self.cache = []
            self.make_hash_list()
            return False
    else:
        self.cache = []
        self.make_hash_list()
        return False
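
# The two mtime checks used above are not shown. A minimal sketch with
# pathlib, assuming the intended behavior is "the cache is stale if any
# file under target_dir is newer than the cache file":
from pathlib import Path

def check_mtime(self, path):
    """Modification time (epoch seconds) of a single file."""
    return Path(path).stat().st_mtime

def check_latest_dir_mtime(self, target_dir):
    """Most recent modification time of anything under target_dir."""
    mtimes = [p.stat().st_mtime for p in Path(target_dir).rglob('*')]
    return max(mtimes) if mtimes else 0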
def make_hash_list(self):
    if self.num_proc is None:
        self.num_proc = cpu_count() - 1
    try:
        spinner = Spinner(
            prefix="Calculating image hashes (hash-bits={} num-proc={})..."
            .format(self.hash_bits, self.num_proc))
        spinner.start()
        # pathos can pickle bound methods on Python 2; the stdlib Pool
        # handles them fine on Python 3
        if six.PY2:
            from pathos.multiprocessing import ProcessPool as Pool
        elif six.PY3:
            from multiprocessing import Pool
        pool = Pool(self.num_proc)
        self.cache = pool.map(self.gen_hash, self.image_filenames)
        pool.close()
        pool.join()
        spinner.stop()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        spinner.stop()
        sys.exit(1)
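
# Downstream, these hashes are presumably compared pairwise; with imagehash
# objects, subtraction yields the Hamming distance. A minimal sketch of how
# near-duplicates could be grouped from self.cache (`max_distance` is a
# hypothetical parameter, and the O(n^2) scan is for illustration only;
# real implementations usually bucket hashes to avoid comparing every pair):
def find_near_duplicates(self, max_distance=4):
    """Yield filename pairs whose hashes differ by at most max_distance bits."""
    items = list(zip(self.image_filenames, self.cache))
    for i, (name_a, hash_a) in enumerate(items):
        for name_b, hash_b in items[i + 1:]:
            if hash_a - hash_b <= max_distance:
                yield name_a, name_b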