# Example #1
def _get_shared_data():  # pragma: no cover
    """Multiprocessing helper: rebuild numpy arrays from the shared
    ``RawArray`` buffers stored in the module-level ``mp_params`` dict.

    Returns:
        tuple: ``(s, s_counts, prune_count_matrix, psx, multi_label)`` —
        labels, label counts, prune-count matrix, predicted probabilities,
        and the multi-label flag.
    """
    s_counts = _to_np_array(mp_params['s_counts'])
    prune_count_matrix = _to_np_array(
        mp_arr=mp_params['prune_count_matrix'],
        shape=mp_params['pcm_shape'],
    )
    psx = _to_np_array(
        mp_arr=mp_params['psx'],
        dtype='float32',
        shape=mp_params['psx_shape'],
    )
    multi_label = mp_params['multi_label']
    if multi_label:
        # Shared labels arrive as a one-hot matrix the same shape as psx;
        # decode back to per-example class-index lists.
        # (Removed leftover debug print() calls that wrote to stdout from
        # every worker process.)
        s = onehot2int(
            _to_np_array(
                mp_arr=mp_params['s'],
                shape=(psx.shape[0], psx.shape[1]),
            ))
    else:
        s = _to_np_array(mp_params['s'])
    return s, s_counts, prune_count_matrix, psx, multi_label
def find_noisy_labels(prediction, labels, out, multi_label):
    """Detect likely label errors with cleanlab and write them to CSV.

    Args:
        prediction: path to a CSV of model probabilities; first column is
            'Images', remaining columns are per-class probabilities.
        labels: path to a CSV of ground-truth labels with the same column
            layout; label values are binarized at a 0.5 threshold.
        out: output directory for the result file(s); created if missing.
        multi_label: if True, treat each row as a one-hot multi-label vector
            and write a single ``noise_labels_multi.csv``; otherwise run a
            binary noise check per class column and write one ``<class>.csv``
            per column.
    """
    os.makedirs(out, exist_ok=True)

    # Align predictions and labels row-wise by sorting both on image name.
    pred_df = pd.read_csv(prediction).sort_values(by=['Images'])
    label_df = pd.read_csv(labels).sort_values(by=['Images'])

    # Binarize (possibly soft) labels at 0.5; columns after 'Images'.
    label_cols = list(label_df)[1:]
    label_df[label_cols] = 1 * (label_df[label_cols] >= 0.5)

    assert len(pred_df) == len(
        label_df), "Mismatch between predictions and labels"

    psx = np.array(pred_df[list(pred_df)[1:]])
    _labels = np.array(label_df[label_cols])

    if multi_label:
        correctly_formatted_labels = onehot2int(_labels)
        label_errors_bool = cleanlab.pruning.get_noise_indices(
            s=correctly_formatted_labels,
            psx=psx,
            prune_method='prune_by_noise_rate',
            sorted_index_method=None,
            multi_label=True)
        label_errors_idx = cleanlab.pruning.order_label_errors(
            label_errors_bool=label_errors_bool,
            psx=psx,
            labels=correctly_formatted_labels,
            sorted_index_method='normalized_margin')

        nb_dict = {
            'Images': list(),
            'Labels': list(),
            'Prediction': list(),
            'Indices': list()
        }
        for i in label_errors_idx:
            nb_dict['Images'].append(label_df.iloc[i]['Images'])
            nb_dict['Labels'].append(','.join(map(str, _labels[i, :])))
            nb_dict['Prediction'].append(','.join(map(str, psx[i, :])))
            nb_dict['Indices'].append(i)

        # BUG FIX: write into the requested output directory instead of
        # the current working directory (the per-class branch already did).
        pd.DataFrame(nb_dict).to_csv(
            os.path.join(out, 'noise_labels_multi.csv'), index=False)
    else:
        header = list(pred_df.keys())[1:]

        # One binary noise-detection pass per class column.
        for disease in header:
            pred_t = np.array(pred_df[disease])
            label_t = np.array(label_df[disease])

            # Two-column probability matrix [P(negative), P(positive)],
            # as cleanlab expects one column per class. Vectorized instead
            # of the original per-row Python loop.
            binary = np.column_stack([1 - pred_t, pred_t])

            label_errors_bool = cleanlab.pruning.get_noise_indices(
                s=copy.deepcopy(label_t),
                psx=binary,
                prune_method='prune_by_noise_rate',
                sorted_index_method=None)

            label_errors_idx = cleanlab.pruning.order_label_errors(
                label_errors_bool=label_errors_bool,
                psx=binary,
                labels=copy.deepcopy(label_t),
                sorted_index_method='normalized_margin')

            nb_dict = {
                'Images': list(),
                disease: list(),
                'Prob': list(),
                'Indices': list()
            }
            for i in label_errors_idx:
                nb_dict['Images'].append(label_df.iloc[i]['Images'])
                nb_dict[disease].append(label_t[i])
                nb_dict['Prob'].append(float(pred_t[i]))
                nb_dict['Indices'].append(i)

            pd.DataFrame(nb_dict).to_csv(
                os.path.join(out, '{}.csv'.format(disease)), index=False)