Ejemplo n.º 1
0
def process_dists(idx_start, y_each, y_list, path_embeddings, sample_count,
                  classes_size, embedding_size, triplet_similarity, mode):
    """Fill one class's rows of the on-disk distance matrix.

    Loads the embeddings stored for class ``y_each`` and, for every distinct
    class id found in ``y_list``, writes the distance between each of those
    embeddings and that class's centroid into
    ``dists.mmap[idx_start:idx_start + count, class_id]``.

    Args:
        idx_start: Row offset of this class's first sample in the distance
            matrix.
        y_each: Class id; selects ``{y_each}.json`` / ``{y_each}.mmap`` under
            ``path_embeddings``.
        y_list: Iterable of class ids. It may contain repeats (e.g. one label
            per sample); each distinct id is processed once.
        path_embeddings: Directory holding the embedding and distance files.
        sample_count: Number of rows of the distance matrix.
        classes_size: Number of columns of the distance matrix.
        embedding_size: Length of a single embedding vector.
        triplet_similarity: Forwarded unchanged to ``get_distance``.
        mode: Forwarded unchanged to ``get_distance``.

    Returns:
        None. Any exception is logged (message and traceback) and swallowed,
        so a failing worker does not abort the others.
    """
    try:
        path_emb_json = f'{path_embeddings}/{y_each}.json'
        path_emb_mem = f'{path_embeddings}/{y_each}.mmap'
        path_dists_mem = f'{path_embeddings}/dists.mmap'

        dists_mem = np.memmap(path_dists_mem,
                              mode='r+',
                              dtype=np.float16,
                              shape=(sample_count, classes_size))

        emb_json = FileUtils.loadJSON(path_emb_json)
        emb_count = emb_json['count']
        emb_mem = np.memmap(path_emb_mem,
                            mode='r',
                            dtype=np.float16,
                            shape=(emb_count, embedding_size))

        # NOTE(review): the centroids are read from the very same
        # 'dists.mmap' file the distance matrix is written to. This looks like
        # a copy-paste slip (a dedicated centroids file was probably
        # intended), but the path has to change together with the code that
        # writes the centroids, so it is left untouched here.
        path_centroids_mem = f'{path_embeddings}/dists.mmap'
        centroids_mem = np.memmap(path_centroids_mem,
                                  mode='r',
                                  dtype=np.float16,
                                  shape=(classes_size, embedding_size))

        idx_end = idx_start + emb_count
        # dict.fromkeys() keeps first-seen order while dropping repeats:
        # recomputing a column for every repeated label only rewrites the
        # same values again.
        for idx_y in dict.fromkeys(y_list):
            # get_distance is called with two equally shaped operands, so the
            # centroid is repeated once per embedding row.
            np_class_centroids_tiled = np.tile(centroids_mem[idx_y],
                                               (emb_count, 1))
            dists = get_distance(emb_mem, np_class_centroids_tiled,
                                 triplet_similarity, mode).tolist()
            dists_mem[idx_start:idx_end, idx_y] = dists
        # No explicit flush: np.memmap writes pending changes back when the
        # object is released.
    except Exception as e:
        # Best-effort worker: report the failure instead of propagating it.
        logging.error(str(e))
        exc_type, exc_value, exc_tb = sys.exc_info()
        logging.error('\n'.join(
            traceback.format_exception(exc_type, exc_value, exc_tb)))
Ejemplo n.º 2
0
def calculate_accuracy(
        path_embeddings,
        meter_acc: tnt.meter.ClassErrorMeter,
        meter_auc: tnt.meter.AUCMeter,
        type='range',
        norm='l2',
        triplet_similarity='cos',
        mode='cpu',
        embedding_size=None,
        class_max_dist=None,  # precomputed
        class_centroids=None,
        y_list=None,  # precomputed
        sample_count=None,  # precomputed
        paths_embs_idx_path_pairs=None):  # precomputed
    """Feed centroid-distance based predictions into the given meters.

    When ``class_max_dist`` is None, the per-class statistics are computed
    from the ``<class_id>.json`` / ``<class_id>.mmap`` pairs found in
    ``path_embeddings``; otherwise the precomputed arguments are used as
    given. A ``(sample_count, classes_size)`` float16 distance matrix is kept
    in ``dists.mmap`` and only filled (through ``process_dists``) when that
    file did not exist yet. The matrix is then walked in chunks and each
    chunk's predictions are added to ``meter_acc`` and ``meter_auc``.

    Args:
        path_embeddings: Directory with the per-class embedding files; the
            distance matrix is stored there as well.
        meter_acc: Receives ``add(predicted, labels)`` per chunk.
        meter_auc: Receives ``add(max_probability, is_correct)`` per chunk.
        type: ``'range'`` lets a sample vote for every class whose distance
            threshold exceeds the sample's distance to that class centroid
            (votes are normalised per sample); any other value picks the
            single class with the smallest distance.
        norm: ``'l2'`` passes each centroid through ``normalize_vec``.
        triplet_similarity: Forwarded unchanged to ``get_distance``.
        mode: Forwarded unchanged to ``get_distance``.
        embedding_size: Length of a single embedding vector.
        class_max_dist: Optional precomputed ``{class_id: threshold}``.
        class_centroids: Optional precomputed ``{class_id: centroid}``.
        y_list: Optional precomputed list with one class id per sample.
        sample_count: Optional precomputed total number of samples.
        paths_embs_idx_path_pairs: Optional precomputed list of
            ``(row_offset, class_id)`` pairs.

    Returns:
        Tuple ``(class_max_dist, class_centroids, y_list, sample_count,
        paths_embs_idx_path_pairs)`` so a later call can reuse them.
    """
    paths_embs = FileUtils.listSubFiles(path_embeddings)

    # Compute centroids and per-class distance thresholds first.
    if class_max_dist is None:
        class_centroids = {}
        class_max_dist = {}
        y_list = []
        paths_embs_idx_path_pairs = []
        sample_count = 0

        for path_emb in paths_embs:
            if not path_emb.endswith('.json'):
                continue

            y_each = int(os.path.basename(path_emb).split('.')[0])
            path_emb_json = f'{path_embeddings}/{y_each}.json'
            path_emb_mem = f'{path_embeddings}/{y_each}.mmap'

            emb_json = FileUtils.loadJSON(path_emb_json)
            emb_count = emb_json['count']
            emb_mem = np.memmap(path_emb_mem,
                                mode='r',
                                dtype=np.float16,
                                shape=(emb_count, embedding_size))

            # Remember at which row this class starts in the distance matrix.
            paths_embs_idx_path_pairs.append((sample_count, y_each))
            sample_count += emb_count

            # One label per sample of this class.
            y_list += [y_each] * emb_count

            centroid = np.average(emb_mem, axis=0)
            if norm == 'l2':
                centroid = normalize_vec(centroid)
            class_centroids[y_each] = centroid

            np_class_centroids_tiled = np.tile(centroid, (len(emb_mem), 1))
            list_dists = get_distance(np_class_centroids_tiled, emb_mem,
                                      triplet_similarity, mode).tolist()
            list_dists = sorted(list_dists)
            # Drop the 10% most distant embeddings as they could contain
            # noise (but keep at least two values).
            list_dists = list_dists[:max(2, int(len(list_dists) * 0.9))]
            # The largest remaining distance is the class threshold.
            class_max_dist[y_each] = list_dists[-1]

    classes_size = int(np.max(y_list)) + 1

    # Store the distance matrix as a memmap; an already existing file is
    # treated as a valid cache and is not recomputed.
    path_dists_mem = f'{path_embeddings}/dists.mmap'
    is_exist_dists_mem = os.path.exists(path_dists_mem)
    dists_mem = np.memmap(path_dists_mem,
                          mode='r+' if is_exist_dists_mem else 'w+',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))

    # NOTE(review): the centroids are written into the very same 'dists.mmap'
    # file as the distance matrix, so they overlap its first bytes (and the
    # existence check below is always true at this point). This looks like a
    # copy-paste slip, but process_dists reads the centroids back from this
    # exact path, so both places must be changed together.
    path_centroids_mem = f'{path_embeddings}/dists.mmap'
    is_exist_centroids_mem = os.path.exists(path_centroids_mem)
    centroids_mem = np.memmap(path_centroids_mem,
                              mode='r+' if is_exist_centroids_mem else 'w+',
                              dtype=np.float16,
                              shape=(classes_size, embedding_size))
    for key, value in class_centroids.items():
        centroids_mem[key] = value

    if not is_exist_dists_mem:
        Parallel(n_jobs=multiprocessing.cpu_count() * 2, backend='threading')(
            delayed(process_dists)(idx_start, y_each, y_list, path_embeddings,
                                   sample_count, classes_size, embedding_size,
                                   triplet_similarity, mode)
            for idx_start, y_each in paths_embs_idx_path_pairs)

        # Re-open read-only to pick up what the workers wrote.
        dists_mem = np.memmap(path_dists_mem,
                              mode='r',
                              dtype=np.float16,
                              shape=(sample_count, classes_size))

    # Walk the precomputed distances chunk by chunk so that only a slice of
    # the matrix is held in memory while feeding the meters.
    chunk_size = 1024
    # Step by chunk_size: the loop variable is a sample offset, so every
    # sample is visited exactly once.
    for idx_chunk_start in range(0, sample_count, chunk_size):
        idx_chunk_end = min(sample_count, idx_chunk_start + chunk_size)
        chunk_each_size = idx_chunk_end - idx_chunk_start

        # Builtin float/int replace the np.float/np.int aliases that were
        # removed from NumPy.
        if type == 'range':
            predicted = np.zeros((chunk_each_size, classes_size), dtype=float)
        else:
            predicted = np.full((chunk_each_size, classes_size),
                                1e9,
                                dtype=float)

        # Load the chunk once; float64 keeps threshold comparisons from being
        # rounded to float16.
        dists = np.asarray(dists_mem[idx_chunk_start:idx_chunk_end],
                           dtype=float)

        for idx_y, max_dist in class_max_dist.items():
            if type == 'range':
                # One vote for every sample that is closer to this centroid
                # than the class threshold.
                predicted[:, idx_y] += (max_dist > dists[:, idx_y]).astype(
                    float)
            else:
                # Keep, per class, the distance to its centroid.
                predicted[:, idx_y] = np.minimum(predicted[:, idx_y],
                                                 dists[:, idx_y])

        if type == 'range':
            # Turn votes into per-sample probabilities; the epsilon avoids a
            # division by zero for samples without any vote.
            predicted = predicted / (np.sum(predicted, axis=1, keepdims=True) +
                                     1e-18)
        else:
            # TODO softmax/hardmax based accuracy
            # For each sample select the class with the smallest distance and
            # give it probability 1.
            idx_class = np.argmin(predicted, axis=1)
            predicted = np.zeros_like(predicted)
            predicted[np.arange(predicted.shape[0]), idx_class] = 1.0

        y_chunk = np.array(y_list[idx_chunk_start:idx_chunk_end])
        meter_acc.add(predicted, y_chunk)

        # The AUC meter asserts binary targets, so it is fed whether the top
        # prediction was correct rather than the class id.
        idxes_classes = np.argmax(predicted, axis=1)
        target_tp = np.array(np.equal(y_chunk, idxes_classes), dtype=int)
        meter_auc.add(np.max(predicted, axis=1), target_tp)

    return class_max_dist, class_centroids, y_list, sample_count, paths_embs_idx_path_pairs