Example #1
import os
from datetime import datetime

FileUtils.createDir(args.path_output)
logging_utils = LoggingUtils(
    f"{args.path_output}/simpsons-{datetime.now().strftime('%y-%m-%d_%H-%M-%S')}.log"
)

class_names = []
last_class_name = None

mmap_shape = [0, 3, args.size_img, args.size_img]

logging_utils.info(
    'moving test samples into train to turn the classification task into a re-identification task'
)

paths_files = FileUtils.listSubFiles(args.path_input_test)
for path_file in paths_files:
    base_path_file = os.path.basename(path_file)
    base_path_file = os.path.splitext(base_path_file)[0]  # strip the .jpg extension
    str_sample_idx = base_path_file[base_path_file.rindex('_') + 1:]
    base_path_file = base_path_file[:base_path_file.rindex('_')]  # strip the sample index suffix, e.g. _22
    if os.path.exists(f'{args.path_input_train}/{base_path_file}'):
        os.rename(
            path_file,
            f'{args.path_input_train}/{base_path_file}/test_{str_sample_idx}.jpg'
        )
    else:
        logging_utils.error(
            f'class directory does not exist, cannot move from test: {args.path_input_train}/{base_path_file}'
        )
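
Both examples lean on project-specific helpers (FileUtils, LoggingUtils) whose implementations are not shown. The sketch below is a minimal stand-in inferred only from the call sites above (createDir, listSubFiles, loadJSON, and a file-backed logger with info/error); the project's real implementations may differ.

import json
import logging
import os


class FileUtils:
    # Minimal stand-ins matching the calls used in these examples.
    @staticmethod
    def createDir(path):
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def listSubFiles(path):
        return [os.path.join(path, name) for name in sorted(os.listdir(path))]

    @staticmethod
    def loadJSON(path):
        with open(path, 'r') as f:
            return json.load(f)


class LoggingUtils:
    # Assumed behavior: write to the given log file and mirror to stdout.
    def __init__(self, path_log):
        self.logger = logging.getLogger(path_log)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(logging.FileHandler(path_log))
        self.logger.addHandler(logging.StreamHandler())

    def info(self, msg):
        self.logger.info(msg)

    def error(self, msg):
        self.logger.error(msg)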
Example #2
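Besides FileUtils, this example assumes numpy, joblib, and torchnet, plus three project helpers (normalize_vec, get_distance, process_dists) that are not shown. The sketch below fills them in under plain-numpy assumptions inferred from the call sites; in particular, the real get_distance may dispatch to the GPU when mode != 'cpu', and the real process_dists may differ in detail.

import multiprocessing

import numpy as np
import torchnet as tnt
from joblib import Parallel, delayed


def normalize_vec(vec):
    # L2-normalize a single vector; epsilon guards against division by zero
    return vec / (np.linalg.norm(vec) + 1e-18)


def get_distance(a, b, triplet_similarity='cos', mode='cpu'):
    # Row-wise distance between two equally shaped batches of vectors
    # (mode is accepted for signature compatibility; this sketch is CPU-only).
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    if triplet_similarity == 'cos':
        a_n = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-18)
        b_n = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-18)
        return 1.0 - np.sum(a_n * b_n, axis=1)  # cosine distance
    return np.linalg.norm(a - b, axis=1)  # euclidean fallback


def process_dists(idx_start, y_each, y_list, path_embeddings, sample_count,
                  classes_size, embedding_size, triplet_similarity, mode):
    # Fill the rows of dists.mmap belonging to class y_each with the distance
    # from each of its embeddings to every class centroid
    # (y_list is unused here; kept for signature compatibility).
    emb_json = FileUtils.loadJSON(f'{path_embeddings}/{y_each}.json')
    emb_mem = np.memmap(f'{path_embeddings}/{y_each}.mmap', mode='r',
                        dtype=np.float16,
                        shape=(emb_json['count'], embedding_size))
    centroids_mem = np.memmap(f'{path_embeddings}/centroids.mmap', mode='r',
                              dtype=np.float16,
                              shape=(classes_size, embedding_size))
    dists_mem = np.memmap(f'{path_embeddings}/dists.mmap', mode='r+',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))
    for idx_class in range(classes_size):
        centroid_tiled = np.tile(centroids_mem[idx_class], (len(emb_mem), 1))
        dists_mem[idx_start:idx_start + len(emb_mem),
                  idx_class] = get_distance(centroid_tiled, emb_mem,
                                            triplet_similarity, mode)
    dists_mem.flush()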
def calculate_accuracy(
        path_embeddings,
        meter_acc: tnt.meter.ClassErrorMeter,
        meter_auc: tnt.meter.AUCMeter,
        type='range',
        norm='l2',
        triplet_similarity='cos',
        mode='cpu',
        embedding_size=None,
        class_max_dist=None,  # precomputed
        class_centroids=None,
        y_list=None,  # precomputed
        sample_count=None,  # precomputed
        paths_embs_idx_path_pairs=None):  # precomputed

    paths_embs = FileUtils.listSubFiles(path_embeddings)

    # calculate centroids first
    if class_max_dist is None:
        class_centroids = {}
        class_max_dist = {}
        y_list = []
        paths_embs_idx_path_pairs = []
        sample_count = 0

        for path_emb in paths_embs:
            if path_emb.endswith('.json'):
                y_each = int(os.path.basename(path_emb).split('.')[0])
                path_emb_json = f'{path_embeddings}/{y_each}.json'
                path_emb_mem = f'{path_embeddings}/{y_each}.mmap'

                emb_json = FileUtils.loadJSON(path_emb_json)
                emb_mem = np.memmap(path_emb_mem,
                                    mode='r',
                                    dtype=np.float16,
                                    shape=(emb_json['count'], embedding_size))

                paths_embs_idx_path_pairs.append((sample_count, y_each))
                sample_count += emb_json['count']

                y_list += (np.ones(
                    (emb_json['count'], ), dtype=np.int64) * y_each).tolist()

                class_centroids[y_each] = np.average(emb_mem, axis=0)
                if norm == 'l2':
                    class_centroids[y_each] = normalize_vec(
                        class_centroids[y_each])

                np_class_centroids_tiled = np.tile(class_centroids[y_each],
                                                   (len(emb_mem), 1))
                list_dists = get_distance(np_class_centroids_tiled, emb_mem,
                                          triplet_similarity, mode).tolist()
                list_dists = sorted(list_dists)
                list_dists = list_dists[:max(
                    2, int(len(list_dists) * 0.9)
                )]  # drop the top 10% of distances, which may come from noisy embeddings
                class_max_dist[y_each] = list_dists[-1]  # largest remaining distance

    classes_size = int(np.max(y_list)) + 1

    # store distance matrix as memmap for optimization
    path_dists_mem = f'{path_embeddings}/dists.mmap'
    is_exist_dists_mem = os.path.exists(path_dists_mem)
    dists_mem = np.memmap(path_dists_mem,
                          mode='r+' if is_exist_dists_mem else 'w+',
                          dtype=np.float16,
                          shape=(sample_count, classes_size))
    #dists_mem.flush()

    path_centroids_mem = f'{path_embeddings}/centroids.mmap'
    is_exist_centroids_mem = os.path.exists(path_centroids_mem)
    centroids_mem = np.memmap(path_centroids_mem,
                              mode='r+' if is_exist_centroids_mem else 'w+',
                              dtype=np.float16,
                              shape=(classes_size, embedding_size))
    for key, value in class_centroids.items():
        centroids_mem[key] = value
    centroids_mem.flush()  # make the centroids visible to the workers that read this file

    if not is_exist_dists_mem:
        Parallel(n_jobs=multiprocessing.cpu_count() * 2, backend='threading')(
            delayed(process_dists)(idx_start, y_each, y_list, path_embeddings,
                                   sample_count, classes_size, embedding_size,
                                   triplet_similarity, mode)
            for idx_start, y_each in paths_embs_idx_path_pairs)

        dists_mem = np.memmap(path_dists_mem,
                              mode='r',
                              dtype=np.float16,
                              shape=(sample_count, classes_size))

    # iterate over the precomputed distances in chunks and feed them to the meters, keeping memory use bounded
    chunk_size = 1024
    for idx_chunk in range(sample_count // chunk_size + 1):
        idx_chunk_start = idx_chunk * chunk_size
        idx_chunk_end = min(sample_count, idx_chunk_start + chunk_size)
        chunk_each_size = idx_chunk_end - idx_chunk_start

        if chunk_each_size == 0:
            break

        if type == 'range':
            predicted = np.zeros((chunk_each_size, classes_size),
                                 dtype=np.float64)
        else:
            predicted = np.ones(
                (chunk_each_size, classes_size), dtype=np.float64) * 1e9
        target = np.zeros((chunk_each_size, classes_size), dtype=np.float64)

        # the one-hot target matrix and the distance slice do not depend on the class,
        # so build them once per chunk rather than once per class
        for idx_row in range(chunk_each_size):
            target[idx_row, y_list[idx_chunk_start + idx_row]] = 1.0
        dists = dists_mem[idx_chunk_start:idx_chunk_end]

        for idx_y in class_max_dist.keys():
            max_dist = class_max_dist[idx_y]

            if type == 'range':
                for idx_emb, dist in enumerate(dists):
                    if max_dist > dist[idx_y]:
                        predicted[idx_emb, idx_y] += 1.0
            else:
                predicted[:, idx_y] = np.minimum(
                    predicted[:, idx_y], dists[:, idx_y]
                )  # store for each class closest embedding with distance value

        if type == 'range':
            predicted = predicted / (np.sum(predicted, axis=1, keepdims=True) +
                                     1e-18)
        else:
            # TODO softmax/hardmax based accuracy
            idx_class = np.argmin(
                predicted, axis=1)  # for each sample, pick the class with the closest distance
            predicted = np.zeros_like(predicted)  # re-init as a probability matrix
            predicted[
                np.arange(predicted.shape[0]),
                idx_class] = 1.0  # assign probability 1.0 to each sample's closest class
        y_chunk = np.array(y_list[idx_chunk_start:idx_chunk_end])
        meter_acc.add(predicted, y_chunk)

        # AUCMeter asserts its targets are binary (0, 1), so use top-1 correctness as the target
        idxes_classes = np.argmax(predicted, axis=1)
        target_tp = np.array(np.equal(y_chunk, idxes_classes), dtype=np.int64)
        meter_auc.add(np.max(predicted, axis=1), target_tp)

    return class_max_dist, class_centroids, y_list, sample_count, paths_embs_idx_path_pairs
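
A sketch of how the function might be driven, assuming an embeddings directory laid out as <class_id>.json / <class_id>.mmap pairs as read above; the directory path and embedding width below are placeholders.

meter_acc = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
meter_auc = tnt.meter.AUCMeter()

# The first call computes centroids and per-class distance thresholds;
# the returned values can be passed back in to skip that work on later calls.
class_max_dist, class_centroids, y_list, sample_count, pairs = calculate_accuracy(
    './embeddings',  # hypothetical directory of <class_id>.json/.mmap pairs
    meter_acc,
    meter_auc,
    type='range',
    embedding_size=256)  # assumed embedding width

print('accuracy:', meter_acc.value()[0])
print('AUC:', meter_auc.value()[0])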