# File names of the shared memmaps kept inside the embeddings directory.
# The centroid table needs a file of its own: keeping it in the same file as
# the distance matrix makes the two arrays overwrite each other.
_DISTS_MMAP_NAME = 'dists.mmap'
_CENTROIDS_MMAP_NAME = 'centroids.mmap'


def process_dists(idx_start, y_each, y_list, path_embeddings, sample_count,
                  classes_size, embedding_size, triplet_similarity, mode):
    """Fill the distance-matrix rows that belong to one class file.

    Reads the embeddings stored in ``{path_embeddings}/{y_each}.mmap`` (their
    count comes from ``{y_each}.json``), computes their distance to the
    centroid of every class that occurs in ``y_list`` and writes the result
    into rows ``idx_start : idx_start + count`` of the shared distance memmap.

    Args:
        idx_start: first row of the distance matrix owned by this class file.
        y_each: class label; also the base name of the ``.json``/``.mmap`` pair.
        y_list: labels of all samples; only its distinct values are used.
        path_embeddings: directory holding the embedding and memmap files.
        sample_count: number of rows of the distance matrix.
        classes_size: number of columns of the distance matrix and number of
            rows of the centroid table.
        embedding_size: length of one embedding vector.
        triplet_similarity: forwarded to ``get_distance``.
        mode: forwarded to ``get_distance``.

    Returns:
        None. Any exception is logged and swallowed, so a failed worker leaves
        its rows of the distance matrix untouched.
    """
    try:
        dists_mem = np.memmap(f'{path_embeddings}/{_DISTS_MMAP_NAME}',
                              mode='r+',
                              dtype=np.float16,
                              shape=(sample_count, classes_size))

        emb_json = FileUtils.loadJSON(f'{path_embeddings}/{y_each}.json')
        emb_count = emb_json['count']
        emb_mem = np.memmap(f'{path_embeddings}/{y_each}.mmap',
                            mode='r',
                            dtype=np.float16,
                            shape=(emb_count, embedding_size))

        centroids_mem = np.memmap(f'{path_embeddings}/{_CENTROIDS_MMAP_NAME}',
                                  mode='r',
                                  dtype=np.float16,
                                  shape=(classes_size, embedding_size))

        # y_list holds one label per sample; every class column only has to be
        # computed once, so iterate over the distinct labels.
        for idx_y in sorted(set(y_list)):
            np_class_centroids_tiled = np.tile(centroids_mem[idx_y],
                                               (emb_count, 1))
            dists = get_distance(emb_mem, np_class_centroids_tiled,
                                 triplet_similarity, mode).tolist()
            dists_mem[idx_start:idx_start + emb_count, idx_y] = dists

        # Push the rows to the file so that other mappings of it see them.
        dists_mem.flush()
    except Exception as e:
        logging.error(str(e))
        exc_type, exc_value, exc_tb = sys.exc_info()
        logging.error('\n'.join(
            traceback.format_exception(exc_type, exc_value, exc_tb)))


def _collect_class_stats(path_embeddings, embedding_size, norm,
                         triplet_similarity, mode):
    """Scan the per-class embedding files and derive centroids and radii."""
    class_centroids = {}
    class_max_dist = {}
    y_list = []
    paths_embs_idx_path_pairs = []
    sample_count = 0

    for path_emb in FileUtils.listSubFiles(path_embeddings):
        if not path_emb.endswith('.json'):
            continue

        y_each = int(os.path.basename(path_emb).split('.')[0])
        emb_json = FileUtils.loadJSON(f'{path_embeddings}/{y_each}.json')
        emb_count = emb_json['count']
        emb_mem = np.memmap(f'{path_embeddings}/{y_each}.mmap',
                            mode='r',
                            dtype=np.float16,
                            shape=(emb_count, embedding_size))

        # Remember at which row of the distance matrix this class starts.
        paths_embs_idx_path_pairs.append((sample_count, y_each))
        sample_count += emb_count
        y_list += [y_each] * emb_count

        centroid = np.average(emb_mem, axis=0)
        if norm == 'l2':
            centroid = normalize_vec(centroid)
        class_centroids[y_each] = centroid

        np_class_centroids_tiled = np.tile(centroid, (len(emb_mem), 1))
        list_dists = sorted(
            get_distance(np_class_centroids_tiled, emb_mem,
                         triplet_similarity, mode).tolist())
        # Drop the farthest 10 percent of embeddings as they could contain
        # noise (but always keep at least two) and use the largest remaining
        # distance as the class radius.
        list_dists = list_dists[:max(2, int(len(list_dists) * 0.9))]
        class_max_dist[y_each] = list_dists[-1]

    return (class_max_dist, class_centroids, y_list, sample_count,
            paths_embs_idx_path_pairs)


def _predict_chunk(dists, class_max_dist, classes_size, acc_type):
    """Convert a (samples, classes) block of distances into class scores."""
    chunk_len = dists.shape[0]

    if acc_type == 'range':
        # A sample votes for every class whose radius it falls inside; the
        # votes are then normalised to sum to one per sample.
        predicted = np.zeros((chunk_len, classes_size), dtype=float)
        for idx_y, max_dist in class_max_dist.items():
            predicted[:, idx_y] += dists[:, idx_y] < max_dist
        return predicted / (np.sum(predicted, axis=1, keepdims=True) + 1e-18)

    # TODO softmax/hardmax based accuracy
    # Classes without a known radius keep the 1e9 placeholder distance.
    predicted = np.full((chunk_len, classes_size), 1e9, dtype=float)
    for idx_y in class_max_dist:
        predicted[:, idx_y] = np.minimum(predicted[:, idx_y], dists[:, idx_y])
    idx_closest = np.argmin(predicted, axis=1)  # closest centroid per sample
    one_hot = np.zeros_like(predicted)
    one_hot[np.arange(chunk_len), idx_closest] = 1.0  # 100% for that class
    return one_hot


def calculate_accuracy(
        path_embeddings,
        meter_acc: tnt.meter.ClassErrorMeter,
        meter_auc: tnt.meter.AUCMeter,
        type='range',
        norm='l2',
        triplet_similarity='cos',
        mode='cpu',
        embedding_size=None,
        class_max_dist=None,  # precomputed
        class_centroids=None,  # precomputed
        y_list=None,  # precomputed
        sample_count=None,  # precomputed
        paths_embs_idx_path_pairs=None):  # precomputed
    """Score stored embeddings against class centroids and feed the meters.

    When ``class_max_dist`` is None the per-class ``.json``/``.mmap`` files in
    ``path_embeddings`` are scanned to build the centroids, the per-class
    radius (largest distance after dropping the farthest 10 percent), the
    label list and the row offsets; otherwise the five ``precomputed``
    arguments are used as given. The sample-to-centroid distance matrix is
    cached in ``dists.mmap`` inside ``path_embeddings`` and only computed when
    that file does not exist yet; the centroid table is stored next to it in
    ``centroids.mmap``. Predictions are then produced in chunks of 1024
    samples and added to ``meter_acc`` and ``meter_auc``.

    NOTE: a ``dists.mmap`` left behind by an earlier run is reused as is;
    delete it to force recomputation.

    Args:
        path_embeddings: directory with the ``{label}.json``/``{label}.mmap``
            files; the memmap caches are written here as well.
        meter_acc: receives ``(predicted, labels)`` for every chunk.
        meter_auc: receives ``(max score, is_correct)`` for every chunk.
        type: ``'range'`` scores a class when the sample lies inside its
            radius; any other value picks the closest centroid. (The name
            shadows the builtin but is kept for caller compatibility.)
        norm: ``'l2'`` passes each centroid through ``normalize_vec``.
        triplet_similarity: forwarded to ``get_distance``.
        mode: forwarded to ``get_distance``.
        embedding_size: length of one embedding vector; required for reading
            the embedding files and the centroid table.
        class_max_dist: optional precomputed ``{label: radius}``.
        class_centroids: optional precomputed ``{label: centroid}``.
        y_list: optional precomputed label per sample.
        sample_count: optional precomputed number of samples.
        paths_embs_idx_path_pairs: optional precomputed ``(row offset, label)``
            pairs, one per class file.

    Returns:
        Tuple ``(class_max_dist, class_centroids, y_list, sample_count,
        paths_embs_idx_path_pairs)`` so the caller can pass them back in.
    """
    # calculate centroids first
    if class_max_dist is None:
        (class_max_dist, class_centroids, y_list, sample_count,
         paths_embs_idx_path_pairs) = _collect_class_stats(
             path_embeddings, embedding_size, norm, triplet_similarity, mode)

    classes_size = int(np.max(y_list)) + 1

    # store distance matrix as memmap for optimization
    path_dists_mem = f'{path_embeddings}/{_DISTS_MMAP_NAME}'
    is_exist_dists_mem = os.path.exists(path_dists_mem)
    dists_mem = np.memmap(path_dists_mem,
                          mode='r+' if is_exist_dists_mem else 'w+',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))

    path_centroids_mem = f'{path_embeddings}/{_CENTROIDS_MMAP_NAME}'
    is_exist_centroids_mem = os.path.exists(path_centroids_mem)
    centroids_mem = np.memmap(path_centroids_mem,
                              mode='r+' if is_exist_centroids_mem else 'w+',
                              dtype=np.float16,
                              shape=(classes_size, embedding_size))
    for key, value in class_centroids.items():
        centroids_mem[key] = value
    # The workers map this file themselves, so write it out before they start.
    centroids_mem.flush()

    if not is_exist_dists_mem:
        Parallel(n_jobs=multiprocessing.cpu_count() * 2, backend='threading')(
            delayed(process_dists)(idx_start, y_each, y_list, path_embeddings,
                                   sample_count, classes_size, embedding_size,
                                   triplet_similarity, mode)
            for idx_start, y_each in paths_embs_idx_path_pairs)

    dists_mem = np.memmap(path_dists_mem,
                          mode='r',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))

    # iterate through precomputed distances to add to data to meters for mem
    # optimization
    chunk_size = 1024
    for idx_chunk_start in range(0, sample_count, chunk_size):
        idx_chunk_end = min(sample_count, idx_chunk_start + chunk_size)

        # Work on a float64 copy so comparisons against the radii do not
        # depend on NumPy's float16/Python-float promotion rules.
        dists = np.asarray(dists_mem[idx_chunk_start:idx_chunk_end],
                           dtype=np.float64)
        predicted = _predict_chunk(dists, class_max_dist, classes_size, type)

        y_chunk = np.array(y_list[idx_chunk_start:idx_chunk_end])
        meter_acc.add(predicted, y_chunk)

        # The original author's note here reads "AssertionError: targets
        # should be binary (0, 1)"; target_tp is 1 where the top class is
        # correct, 0 otherwise.
        idxes_classes = np.argmax(predicted, axis=1)
        target_tp = np.array(np.equal(y_chunk, idxes_classes), dtype=int)
        meter_auc.add(np.max(predicted, axis=1), target_tp)

    return (class_max_dist, class_centroids, y_list, sample_count,
            paths_embs_idx_path_pairs)