Example #1
def confusion_matrix(output, labels, num_labels):
    conf_meter = ConfusionMeter(num_labels)
    auc_meter = AUCMeter()
    preds = output.max(1)[1].type_as(labels)  # argmax over the class dimension
    conf_meter.add(preds.data.squeeze(), labels.type(torch.LongTensor).data)
    auc_meter.add(preds.data.squeeze(), labels.data.squeeze())
    return conf_meter, auc_meter
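
A note on the snippet above: it feeds hard argmax predictions into AUCMeter, which is really meant for continuous scores; with a single threshold the ROC curve degenerates to one point. A minimal, self-contained sketch of the torchnet AUCMeter API (random data, purely illustrative):

import torch
from torchnet.meter import AUCMeter

meter = AUCMeter()
scores = torch.rand(100)                   # continuous scores, e.g. sigmoid outputs
targets = (torch.rand(100) > 0.5).long()   # binary ground truth in {0, 1}
meter.add(scores, targets)                 # accumulate one batch of scores/targets
area, tpr, fpr = meter.value()             # AUC plus the full ROC curve
print("AUC = %.3f" % area)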
Example #2
    def __init__(self, labels, logger=None):
        # self._labels = labels
        self._logger = logger
        self._loss = []
        self._acc = []
        self._labels = labels
        self._conf = ConfusionMeter(len(self._labels))
        # self._auc = {label: AUCMeter() for label in range(len(labels))}
        self._auc = AUCMeter()
        self._c_inp, self._c_tar = Counter(), Counter()
Example #3
class ModelMeter:
    def __init__(self, labels, logger=None):
        # self._labels = labels
        self._logger = logger
        self._loss = []
        self._acc = []
        self._labels = labels
        self._conf = ConfusionMeter(len(self._labels))
        # self._auc = {label: AUCMeter() for label in range(len(labels))}
        self._auc = AUCMeter()
        self._c_inp, self._c_tar = Counter(), Counter()

    def update(self, loss, output, targets):
        self._loss.append(loss)
        self._acc.append(accuracy(output, targets))

        preds = output.max(1)[1].type_as(targets)
        p = preds.cpu().data.numpy()
        t = targets.cpu().data.numpy()
        cur_indexes = np.logical_and(p != 2, t != 2)
        self._auc.add(p[cur_indexes], t[cur_indexes])
        self._conf.add(preds.data.squeeze(), targets.data.squeeze())

        self._c_inp.update(p)
        self._c_tar.update(t)
        # preds = preds[preds != 2 and (targets != 2)]
        # for label, meter in self._auc.items():
        #     cur_pred = preds.eq(label)
        #     cur_label = targets.eq(label)
        #     meter.add(cur_pred.data.squeeze(), cur_label.data.squeeze())

    def log(self, msg, level=logging.DEBUG, func=None):
        if func is None:
            loss, acc = self._loss[-1].data[0], self._acc[-1].data[0]
        else:
            loss, acc = func(self._loss).data[0], func(self._acc).data[0]
        self._logger.log(level, "%s: loss: %3.4f, acc: %3.4f", msg, loss, acc)

    @property
    def last_acc(self):
        return self._acc[-1].data[0]

    @property
    def last_loss(self):
        return self._loss[-1].data[0]
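
The update method above masks out class index 2 (presumably a neutral/ignore label) before feeding the binary AUCMeter. A small numpy illustration of that filter, with made-up prediction and target arrays:

import numpy as np

p = np.array([0, 1, 2, 1, 0])           # predicted class indices
t = np.array([0, 2, 2, 1, 1])           # target class indices
keep = np.logical_and(p != 2, t != 2)   # keep rows where neither side is class 2
print(p[keep], t[keep])                 # -> [0 1 0] [0 1 1]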
Example #4
def test(segment_size):
    test_list_pub = pickle.load(
        open(os.path.join(MTAT_SPLIT_FOLDER, 'test_list_pub.cP'), 'rb'))
    total_test_size = len(test_list_pub)

    model = local_model(segment_size).cuda()
    model.load_state_dict(
        torch.load(
            os.path.join(ENCODER_FOLDER,
                         'local_model_' + str(segment_size) + '.pt')))
    model.eval()
    auc = AUCMeter()

    for start in range(0, total_test_size, n_songs):
        print("Loading dataset...", start)
        test_features = np.concatenate([
            np.load(
                os.path.join(MTAT_NPY_FOLDER, 'testing/' + test_list_pub[i]))
            for i in range(start, min(start + n_songs, total_test_size))
        ])
        test_labels = np.load(os.path.join(
            MTAT_SPLIT_FOLDER,
            'y_test_pub.npy'))[start:min(start + n_songs, total_test_size)]
        if normalization:
            mean = np.mean(test_features, axis=0)
            var = np.var(test_features, axis=0)
            test_features = (test_features - mean) / np.sqrt(var)

        test_data = CustomDataset(test_features, test_labels)
        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=batch_size)
        print("Dataset loaded")

        for data, labels in test_loader:
            X = Variable(data).cuda()
            out, _ = model(X)
            auc_out = np.reshape(out.data.cpu().numpy(), -1)
            auc_target = np.reshape(labels, -1)
            auc.add(auc_out, auc_target)

    auc_tuple = auc.value()
    print("AUC = ", auc_tuple[0])
    plt.plot(auc_tuple[2], auc_tuple[1])  # ROC curve: FPR vs TPR
    plt.plot([0, 1])  # chance diagonal
    plt.show()
Example #5
def evaluate(model, eval_iter, opt):
    model.eval()
    accuracy = []
    threshold = 0.5

    AUC_list = [AUCMeter() for _ in range(opt.label_size)]

    for index, batch in enumerate(eval_iter):
        text = batch.comment_text.data
        label = torch.stack([
            batch.toxic, batch.severe_toxic, batch.obscene,
            batch.threat, batch.insult, batch.identity_hate
        ], dim=1)

        label = label.float()

        # for label (batch_size, classes_size)
        # for text (batch_size, max_seq_len)

        pred = model(text)

        is_class = pred > threshold  # True where the raw model output exceeds the threshold
        is_class = is_class.float()  # (batch_size, classes_size)

        # for AUC_meter
        pred = torch.nn.functional.sigmoid(pred)
        print(pred)
        print(label)

        for i in range(opt.label_size):
            if opt.use_cuda:
                AUC_list[i].add(
                    output=pred.data.cpu().numpy()[:, i],
                    target=label.data.cpu().numpy()[:, i]
                )
            else:
                AUC_list[i].add(
                    output=pred.data.numpy()[:, i],
                    target=label.data.numpy()[:, i]
                )

        precision = (is_class == label).float()  # (batch_size, classes_size)
        precision = precision.mean(dim=0)  # (classes_size)

        if opt.use_cuda:
            accuracy.append(precision.data.cpu().numpy())
        else:
            accuracy.append(precision.data.numpy())
    model.train()
    # per-class AUC
    AUC_scores = [AUC_list[i].value()[0] for i in range(opt.label_size)]
    return np.mean(accuracy, axis=0), AUC_scores  # mean accuracy and AUC for all six classes
Example #6
    def __init__(self,
                 input_key: str = "targets",
                 output_key: str = "logits",
                 prefix: str = "auc",
                 class_names: List[str] = None,
                 num_classes: int = 1):
        self.prefix = prefix
        self.input_key = input_key
        self.output_key = output_key

        self.class_names = class_names
        self.num_classes = num_classes \
            if class_names is None \
            else len(class_names)

        assert self.num_classes is not None

        self.auc_meters = [AUCMeter() for _ in range(self.num_classes)]
Example #7
    def __init__(self, r, transform, mode, pred=[], probability=[], log=''):
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        train_loader, val_loader = get_chexpert_loaders(r, batch_size=32)

        if self.mode == 'test':
            self.test_data = val_loader.get_all_samples()
            self.test_label = val_loader.get_all_real_ground_truth()
        else:
            train_label = train_loader.get_all_real_ground_truth()
            train_data = train_loader.get_all_samples()
            noise_label = train_loader.get_all_labels()

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            elif self.mode == 'labeled':
                pred_idx = pred.nonzero()[0]
                self.probability = [probability[i] for i in pred_idx]

                clean = (np.array(noise_label) == np.array(train_label))
                auc_meter = AUCMeter()
                auc_meter.reset()
                auc_meter.add(probability, clean)
                auc, _, _ = auc_meter.value()
                log.write('Number of labeled samples:%d   AUC:%.3f\n' %
                          (pred.sum(), auc))
                log.flush()

                self.train_data = train_data[pred_idx]
                self.noise_label = noise_label[pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_label)))
            elif self.mode == "unlabeled":
                pred_idx = (1 - pred).nonzero()[0]
                self.train_data = train_data[pred_idx]
                self.noise_label = noise_label[pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_label)))
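
Examples #7, #8, #10, #14, #17 and #22 all share the same DivideMix-style diagnostic: the per-sample probability of being clean is scored with an AUCMeter against the mask of samples whose noisy label still matches the true label. A self-contained sketch of that check (labels and probabilities are made up):

import numpy as np
from torchnet.meter import AUCMeter

noise_label = np.array([3, 1, 4, 1, 5])            # labels after noise injection
train_label = np.array([3, 1, 2, 1, 0])            # original clean labels
probability = np.array([0.9, 0.8, 0.2, 0.7, 0.4])  # model's belief that each sample is clean

clean = (noise_label == train_label)               # ground-truth clean mask
meter = AUCMeter()
meter.reset()
meter.add(probability, clean)
auc, _, _ = meter.value()
print("clean/noisy separation AUC = %.3f" % auc)   # 1.0 here: every clean sample outscores every noisy one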
Example #8
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''): 
        
        self.r = r # noise ratio
        self.transform = transform
        self.mode = mode  
        self.transition = {0:0,2:0,4:7,7:7,1:1,9:1,3:5,5:3,6:6,8:8} # class transition for asymmetric noise
     
        if self.mode=='test' or self.mode=='test_average':
            if dataset=='cifar10':                
                test_dic = unpickle('%s/test_batch'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset=='cifar100':
                test_dic = unpickle('%s/test'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        
        else:
            train_data = []
            train_label = []
            if dataset=='cifar10': 
                for n in range(1,6):
                    dpath = '%s/data_batch_%d'%(root_dir,n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label+data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset=='cifar100':    
                train_dic = unpickle('%s/train'%root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            if self.mode == 'eval' or self.mode == 'eval_average':
                self.eval_data = train_data[45000:]
                self.eval_label = train_label[45000:]

            else:
                if os.path.exists(noise_file):
                    noise_label = json.load(open(noise_file,"r"))
                else:    #inject noise   
                    noise_label = []
                    if self.mode in ['all', 'benchmark_all', 'benchmark_all_average']:
                        size = 50000
                    elif self.mode in ['train', 'benchmark', 'benchmark_average']:
                        size = 45000
                    idx = list(range(size))
                    random.shuffle(idx)
                    num_noise = int(self.r*size)            
                    noise_idx = idx[:num_noise]
                    for i in range(size):
                        if i in noise_idx:
                            if noise_mode=='sym':
                                if dataset=='cifar10': 
                                    noiselabel = random.randint(0,9)
                                elif dataset=='cifar100':    
                                    noiselabel = random.randint(0,99)
                                noise_label.append(noiselabel)
                            elif noise_mode=='asym':   
                                noiselabel = self.transition[train_label[i]]
                                noise_label.append(noiselabel)                    
                        else:    
                            noise_label.append(train_label[i])   
                    print("save noisy labels to %s ..."%noise_file)        
                    json.dump(noise_label,open(noise_file,"w"))       

                if self.mode in ['all', 'benchmark_all', 'benchmark_all_average']:
                    self.train_data = train_data
                    self.noise_label = noise_label
                    self.clean_label = train_label
            
                elif self.mode in ['train', 'benchmark', 'benchmark_average']:
                    self.train_data = train_data[:45000]
                    self.noise_label = noise_label[:45000]
                    self.clean_label = train_label[:45000]
                    
                else:                   
                    if self.mode == "labeled":
                        pred_idx = pred.nonzero()[0]
                        self.probability = [probability[i] for i in pred_idx]
                    
                        clean = (np.array(noise_label)==np.array(train_label))                                                       
                        auc_meter = AUCMeter()
                        auc_meter.reset()
                        auc_meter.add(probability,clean)        
                        auc,_,_ = auc_meter.value()               
                        log.write('Number of labeled samples:%d   AUC:%.3f\n'%(pred.sum(),auc))
                        log.flush()      
                    
                    elif self.mode == "unlabeled":
                        pred_idx = (1-pred).nonzero()[0]                                             
                
                    self.train_data = train_data[pred_idx]
                    self.noise_label = [noise_label[i] for i in pred_idx]                          
                    print("%s data has a size of %d"%(self.mode,len(self.noise_label)))            
Example #9
    background_image = Image.fromarray(background)
    foreground = np.uint8(cm.jet(heatmap) * 255)
    heatmap_opacity = foreground[:, :, 3]
    heatmap_opacity[:] = 64
    threshold_prob = min(0.3, heatmap.max() - 0.05)
    heatmap_opacity[heatmap < threshold_prob] = 0
    foreground_image = Image.fromarray(foreground)
    image = Image.alpha_composite(background_image, foreground_image)
    image.load()  # needed for split()
    background = Image.new('RGB', image.size, color)
    background.paste(image, mask=image.split()[3])
    image_array = np.array(background, dtype=np.uint8)
    return image_array


auc_meter = AUCMeter()
conf_meter = ConfusionMeter(options.n_classes)
iterator = get_train_val_iterators(options)
bar = progressbar.ProgressBar()
for batch_idx, data in bar(enumerate(iterator['val']())):
    output = model(Variable(data['input'].cuda(), volatile=True))
    target = data['target'].cpu().numpy()
    prob_tensor = F.softmax(output['classification']).data
    prob = prob_tensor.cpu().numpy()
    heatmap = F.softmax(output['segmentation']).data.cpu().numpy()
    auc_meter.add(prob[:, 1], target)
    conf_meter.add(prob_tensor, data['target'])

    input_images = data['input'].cpu().numpy()
    for i in range(input_images.shape[0]):
        image = np.repeat(input_images[i], 3, axis=0)
Example #10
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log='', teacher_idx=None, truncate_mode=None, refinement=None): 
        
        self.r = r # noise ratio
        self.transform = transform
        self.mode = mode  
        self.transition = {0:0,2:0,4:7,7:7,1:1,9:1,3:5,5:3,6:6,8:8} # class transition for asymmetric noise
        
        # For distill test
        self.teacher_idx = teacher_idx
        self.truncate_mode = truncate_mode
        self.train_label = None
        self.refinement = refinement
     
        if self.mode=='test':
            if dataset=='cifar10':                
                test_dic = unpickle('%s/test_batch'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))  
                self.test_label = test_dic['labels']
            elif dataset=='cifar100':
                test_dic = unpickle('%s/test'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))  
                self.test_label = test_dic['fine_labels']                            
        else:    
            train_data=[]
            train_label=[]
            if dataset=='cifar10': 
                for n in range(1,6):
                    dpath = '%s/data_batch_%d'%(root_dir,n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label+data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset=='cifar100':    
                train_dic = unpickle('%s/train'%root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            self.train_label = train_label
            
            if os.path.exists(noise_file):
                noise_label = json.load(open(noise_file,"r"))
            else:    #inject noise   
                fix_seed()
                noise_label = []
                idx = list(range(50000))
                random.shuffle(idx)
                num_noise = int(self.r*50000)            
                noise_idx = idx[:num_noise]
                for i in range(50000):
                    if i in noise_idx:
                        if noise_mode=='sym':
                            if dataset=='cifar10': 
                                noiselabel = random.randint(0,9)
                            elif dataset=='cifar100':    
                                noiselabel = random.randint(0,99)
                            noise_label.append(noiselabel)
                        elif noise_mode=='asym':   
                            noiselabel = self.transition[train_label[i]]
                            noise_label.append(noiselabel)
                    else:    
                        noise_label.append(train_label[i]) 
                print("save noisy labels to %s ..."%noise_file)        
                json.dump(noise_label,open(noise_file,"w"))       

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
                if self.truncate_mode == 'initial':
                    self.train_data = self.train_data[teacher_idx]
                    self.noise_label = [noise_label[i] for i in teacher_idx]
            else:                   
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    if self.truncate_mode == 'initial':
                        pred_idx = pred_idx.tolist()
                        teacher_idx = teacher_idx.tolist()
                        pred_idx = list(set(pred_idx) & set(teacher_idx))
                        pred_idx = torch.tensor(pred_idx)
                    
                    self.probability = [probability[i] for i in pred_idx]   
                    
                    clean = (np.array(noise_label)==np.array(train_label))                                                       
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability,clean)        
                    auc,_,_ = auc_meter.value()      
                    log.write('Number of labeled samples:%d   AUC:%.3f\n'%(pred.sum(),auc))
                    log.flush()      
                    
                elif self.mode == "unlabeled":
                    pred_idx = (1-pred).nonzero()[0]
                    if self.truncate_mode == 'initial':
                        whole_idx = list(range(50000))
                        pred_idx = pred_idx.tolist()
                        teacher_idx = teacher_idx.tolist()
                        tmp_set = set(whole_idx) - set(teacher_idx)
                        tmp_set = tmp_set | set(pred_idx)
                        pred_idx = torch.tensor(list(tmp_set))
                    
                elif self.mode == "labeled_svd":
                    if self.refinement:
                        pred_idx = pred.nonzero()[0]
                        pred_idx_set = set(pred_idx.tolist())
                        teacher_idx_set = set(teacher_idx.tolist())
                        pred_idx = torch.tensor(list(pred_idx_set & teacher_idx_set))
                        self.probability = [probability[i] for i in pred_idx]
                        
                        clean = (np.array(noise_label)==np.array(train_label))
                        auc_meter = AUCMeter()
                        auc_meter.reset()
                        auc_meter.add(probability,clean)        
                        auc,_,_ = auc_meter.value()               
                        log.write('Number of labeled samples:%d   AUC:%.3f\n'%(pred.sum(),auc))
                        log.flush()
                    else:
                        pred_idx = teacher_idx
                        probability = torch.ones(50000,)
                        self.probability = [probability[i] for i in pred_idx]

                        log.write('Number of labeled samples (by svd) : %d' % teacher_idx.shape[0])
                
                elif self.mode == "unlabeled_svd":
                    if self.refinement:
                        clean_pred_idx = pred.nonzero()[0]
                        clean_pred_idx_set = set(clean_pred_idx.tolist())
                        teacher_idx_set = set(teacher_idx.tolist())
                        all_idx_set = set(range(50000))
                        pred_idx = torch.tensor(list(all_idx_set - (clean_pred_idx_set & teacher_idx_set)))                    
                    else:
                        pred_idx = torch.arange(0, 50000)
                        pred_idx_set = set(pred_idx.tolist()) - set(teacher_idx.tolist())
                        pred_idx = torch.tensor(list(pred_idx_set))
                
                self.train_data = train_data[pred_idx]
                self.noise_label = [noise_label[i] for i in pred_idx]
                print("%s data has a size of %d"%(self.mode,len(self.noise_label)))
Example #11
    model = model_data["model"]
    loss_fn = model_data["criterion"]
    optim = model_data["optim"]

    nb_data_test = 300
    nb_canaux = 3  # number of channels

    data_root = "./data/"

    data = load_preprocess_data(data_root, 0, nb_data_test)

    signals = data["signals"]
    target = data["targets"]

    res = np.zeros(target.shape)
    auc_c0 = AUCMeter()
    auc_c1 = AUCMeter()
    auc_c2 = AUCMeter()

    print("Testing model...")
    model.eval()
    for i in tqdm(range(nb_data_test)):
        out = model(th.Tensor(signals[None, i, :, :])).detach().numpy()

        auc_c0.add(out[None, 0, 0], target[None, i, 0])
        auc_c1.add(out[None, 0, 1], target[None, i, 1])
        auc_c2.add(out[None, 0, 2], target[None, i, 2])

        res[i, 0] = 1 if out[0, 0] > 0.5 else -1
        res[i, 1] = 1 if out[0, 1] > 0.5 else -1
        res[i, 2] = 1 if out[0, 2] > 0.5 else -1
Example #12
    def __init__(self,
                 dataset,
                 noisy_dataset,
                 r,
                 on,
                 noise_mode,
                 root_dir,
                 noise_data_dir,
                 transform,
                 mode,
                 noise_file='',
                 pred=[],
                 probability=[],
                 log='',
                 targets=None):

        self.r = r  # total noise ratio
        self.on = on  # proportion of open noise
        self.transform = transform
        self.mode = mode
        self.transition = {
            0: 0,
            2: 0,
            4: 7,
            7: 7,
            1: 1,
            9: 1,
            3: 5,
            5: 3,
            6: 6,
            8: 8
        }  # class transition for asymmetric noise
        self.open_noise = None
        self.closed_noise = None

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']

        elif self.mode == 'clean':
            if not os.path.exists(noise_file):
                print('Noise not defined')
                return

            if self.open_noise is None or self.closed_noise is None:
                noise = json.load(open(noise_file, "r"))
                noise_labels = noise['noise_labels']
                self.open_noise = noise['open_noise']
                self.closed_noise = noise['closed_noise']

            train_data = []
            train_label = []
            noise_data = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            open_noise = [item[0] for item in self.open_noise]
            clean_indices = list(
                set(range(50000)) - set(open_noise) - set(self.closed_noise))
            self.clean_data = train_data[clean_indices]
            self.clean_label = np.asarray(train_label)[clean_indices]

        else:
            train_data = []
            train_label = []
            noise_data = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            if noisy_dataset == 'imagenet32':
                noise_data = None
            else:
                noise_data = unpickle(
                    '%s/train' % noise_data_dir)['data'].reshape(
                        (50000, 3, 32, 32)).transpose((0, 2, 3, 1))

            if os.path.exists(noise_file):
                noise = json.load(open(noise_file, "r"))
                noise_labels = noise['noise_labels']
                self.open_noise = noise['open_noise']
                self.closed_noise = noise['closed_noise']
                for cleanIdx, noisyIdx in noise['open_noise']:
                    if noisy_dataset == 'imagenet32':
                        train_data[cleanIdx] = np.asarray(
                            Image.open('{}/{}.png'.format(
                                noise_data_dir,
                                str(noisyIdx + 1).zfill(7)))).reshape(
                                    (32, 32, 3))
                    else:
                        train_data[cleanIdx] = noise_data[noisyIdx]
            else:
                #inject noise
                noise_labels = []  # all labels (some noisy, some clean)
                idx = list(range(50000))  # indices of cifar dataset
                random.shuffle(idx)
                num_total_noise = int(self.r * 50000)  # total amount of noise
                num_open_noise = int(
                    self.on *
                    num_total_noise)  # total amount of noisy/openset images
                if noisy_dataset == 'imagenet32':  # indices of openset source images
                    target_noise_idx = list(range(1281149))
                else:
                    target_noise_idx = list(range(50000))
                random.shuffle(target_noise_idx)
                self.open_noise = list(
                    zip(idx[:num_open_noise], target_noise_idx[:num_open_noise]
                        ))  # clean sample -> openset sample mapping
                self.closed_noise = idx[
                    num_open_noise:num_total_noise]  # closed set noise indices
                # populate noise_labels
                for i in range(50000):
                    if i in self.closed_noise:
                        if noise_mode == 'sym':
                            if dataset == 'cifar10':
                                noiselabel = random.randint(0, 9)
                            elif dataset == 'cifar100':
                                noiselabel = random.randint(0, 99)
                            noise_labels.append(noiselabel)
                        elif noise_mode == 'asym':
                            noiselabel = self.transition[train_label[i]]
                            noise_labels.append(noiselabel)
                    else:
                        noise_labels.append(train_label[i])
                # populate openset noise images
                for cleanIdx, noisyIdx in self.open_noise:
                    if noisy_dataset == 'imagenet32':
                        train_data[cleanIdx] = np.asarray(
                            Image.open('{}/{}.png'.format(
                                noise_data_dir,
                                str(noisyIdx + 1).zfill(7)))).reshape(
                                    (32, 32, 3))
                    else:
                        train_data[cleanIdx] = noise_data[noisyIdx]
                # write noise to a file, to re-use
                noise = {
                    'noise_labels': noise_labels,
                    'open_noise': self.open_noise,
                    'closed_noise': self.closed_noise
                }
                print("save noise to %s ..." % noise_file)
                json.dump(noise, open(noise_file, "w"))

            if self.mode == 'all':
                self.train_data = train_data
                if targets is None:
                    self.noise_labels = noise_labels
                else:
                    self.noise_labels = targets
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    clean = (np.array(noise_labels) == np.array(train_label))
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    # note: If all the labels are clean, the following will return NaN
                    auc, _, _ = auc_meter.value()

                elif self.mode == "unlabeled":
                    pred_idx = pred.nonzero()[0]

                self.train_data = train_data[pred_idx]
                self.noise_labels = [noise_labels[i] for i in pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_labels)))
Example #13
    def __init__(self, num_class):
        super().__init__()
        self.num_class = num_class
        self.meters: List[AUCMeter] = [
            AUCMeter() for k in range(self.num_class)
        ]
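
Example #13 shows only the constructor of a per-class wrapper. A hedged sketch of how such a wrapper might feed and read its meters, assuming column k of the inputs corresponds to class k (the add/values method names are illustrative, not from the snippet):

from typing import List
from torchnet.meter import AUCMeter

class PerClassAUC:
    def __init__(self, num_class: int):
        self.meters: List[AUCMeter] = [AUCMeter() for _ in range(num_class)]

    def add(self, probs, targets):
        # probs, targets: (batch, num_class); targets are binary per class
        for k, meter in enumerate(self.meters):
            meter.add(probs[:, k], targets[:, k])

    def values(self):
        return [m.value()[0] for m in self.meters]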
Example #14
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''):
        # mode: one of 'test', 'all', 'labeled', 'unlabeled'
        self.r = r # noise ratio
        self.transform = transform
        self.mode = mode  
        self.transition = {0:0,2:0,4:4,7:7,1:1,9:1,3:5,5:3,6:6,8:8} # class transition for asymmetric noise
     
        if self.mode=='test':
            if dataset=='cifar10':
                test_dic = unpickle('%s/test_batch'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset=='cifar100':
                test_dic = unpickle('%s/test'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))  
                self.test_label = test_dic['fine_labels']
        else:    
            train_data=[]
            train_label=[]
            if dataset=='cifar10': 
                for n in range(1,6):
                    dpath = '%s/data_batch_%d'%(root_dir,n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label+data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset=='cifar100':    
                train_dic = unpickle('%s/train'%root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            # generate the noise labels
            if os.path.exists(noise_file):
                noise_label = json.load(open(noise_file,"r"))
            else:    #inject noise
                noise_label = []
                idx = list(range(50000))
                random.shuffle(idx)
                # num_noise = int(self.r*50000) -> what a total scam; every index is used instead
                # noise_idx = idx[:num_noise]
                noise_idx = idx[:]

                num_classes = 10 if dataset == 'cifar10' else 100

                if noise_mode == 'sym':
                    C = uniform_mix_C(self.r, num_classes)
                    # if dataset=='cifar10':
                    #     noiselabel = random.randint(0,9)
                    # elif dataset=='cifar100':
                    #     noiselabel = random.randint(0,99)
                    # noise_label.append(noiselabel)
                elif noise_mode == 'asym':
                    C = flip_labels_C(self.r, num_classes)

                for i in range(50000):
                    if i in noise_idx:
                        noiselabel = np.random.choice(num_classes, p=C[train_label[i]])
                        noise_label.append(noiselabel)
                    else:    
                        noise_label.append(train_label[i])   
                print("save noisy labels to %s ..."%noise_file)        
                json.dump(noise_label,open(noise_file,"w"))       


            # whole-data case
            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0] # 4770
                    self.probability = [probability[i] for i in pred_idx] # 4770
                    
                    clean = (np.array(noise_label)==np.array(train_label)) # 39981
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability,clean)
                    auc,_,_ = auc_meter.value()
                    log.write('Number of labeled samples:%d   AUC:%.3f\n'%(pred.sum(),auc))
                    log.flush()
                    
                elif self.mode == "unlabeled":
                    pred_idx = (1-pred).nonzero()[0] # 45230
                
                self.train_data = train_data[pred_idx]
                self.noise_label = [noise_label[i] for i in pred_idx]                          
                print("%s data has a size of %d"%(self.mode,len(self.noise_label)))            
Example #15
def test_vgg16():
    parser = argparse.ArgumentParser("Test VGG16 Main")
    parser.add_argument("-d",
                        "--data-path",
                        type=str,
                        required=True,
                        dest="data_path")
    parser.add_argument("-l",
                        "--label-path",
                        type=str,
                        required=True,
                        dest="label_path")
    parser.add_argument("-m",
                        "--model-path",
                        type=str,
                        required=True,
                        dest="model_path")

    args = parser.parse_args()

    data_path = args.data_path
    label_path = args.label_path
    model_path = args.model_path

    # Test if numpy data and labels exist
    if not exists(data_path):
        raise FileNotFoundError(
            "Numpy data file doesn't exist ({}) !".format(data_path))
    if not exists(label_path):
        raise FileNotFoundError(
            "Numpy label file doesn't exist ({}) !".format(label_path))
    # Test if model save file exist
    if not exists(model_path):
        raise FileNotFoundError(
            "Model state dict file doesn't exist ({}) !".format(model_path))

    print("Load model...")
    # Load model
    vgg16 = get_vgg16_modified()
    vgg16.load_state_dict(th.load(model_path))
    vgg16.cuda()
    vgg16.eval()

    # Create AUC Meter
    auc_meter = AUCMeter()

    print("Load data...")
    # Load data
    data = np.load(data_path)
    labels = np.load(label_path)

    # Split eval
    nb_split = int(data.shape[0] * train_ratio)
    data = data[nb_split:]
    labels = labels[nb_split:]

    batch_size = 32
    nb_batch = ceil(data.shape[0] / batch_size)

    # Loop on eval data
    for i_b in tqdm(range(nb_batch)):
        # Get batch indexes
        i_min = i_b * batch_size
        i_max = (i_b + 1) * batch_size
        i_max = i_max if i_max < data.shape[0] else data.shape[0]

        # Slice data to get batch
        batch = data[i_min:i_max, :, :, :]
        batch = batch.transpose(0, 3, 1, 2)
        batch = th.tensor(batch).cuda().float() / 255.

        # And labels
        batch_label = th.tensor(labels[i_min:i_max]).cuda().float()

        # Forward pass (inference)
        out = vgg16(batch).squeeze(1)

        # Update metric
        auc_meter.add(out.cpu().detach(), batch_label.cpu().detach())

    print("AUC value = {}".format(auc_meter.value()[0]))
Example #16
    simulator.train()
    for epoch in range(4):
        print("---epoch {}---".format(epoch))
        for step, batch in enumerate(train_loader):
            feats, labels = batch
            logits = simulator(**feats)
            loss = criterion(logits, labels)

            opt.zero_grad()
            loss.backward()
            opt.step()

            if (step + 1) % 500 == 0:
                with torch.no_grad():
                    simulator.eval()
                    auc = AUCMeter()
                    for feats, labels in val_loader:
                        outputs = torch.sigmoid(simulator(**feats))
                        auc.add(outputs, labels)
                    print(step, auc.value()[0])
                    if auc.value()[0] > 0.735:
                        break
                simulator.train()

    simulator.to("cpu")
    torch.save(simulator.state_dict(), simulator_path)

# create a torch dataset class that adopt the simulator and generate the synthetic dataset
synthetic_data_path = os.path.join(filepath, "full_impression_feats.pt")
syn = SyntheticMovieLensDataset(filepath,
                                simulator_path,
Example #17
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''): 
        
        self.r = r # noise ratio
        self.transform = transform
        self.mode = mode  
        #self.transition = {0:0,2:0,4:7,7:7,1:1,9:1,3:5,5:3,6:6,8:8} # class transition for asymmetric noise
        #self.transition = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 9: 9, 9: 0}  # ten classes
        self.transition = {0: 1, 1: 0} # two classes

        if self.mode=='test':
            if dataset=='cifar10':                
                test_dic = unpickle('%s/test_batch'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))  
                self.test_label = test_dic['labels']
                if noise_mode == 'asym_two_unbalanced_classes':
                    for i in range(len(self.test_label)):
                        if self.test_label[i] != 1:
                            self.test_label[i] = 0
                #print("self.test_label=",self.test_label)
            elif dataset=='cifar100':
                test_dic = unpickle('%s/test'%root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))  
                self.test_label = test_dic['fine_labels']                            
        else:    
            train_data=[]
            train_label=[]
            if dataset=='cifar10': 
                for n in range(1,6):
                    dpath = '%s/data_batch_%d'%(root_dir,n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label+data_dic['labels']
                train_data = np.concatenate(train_data)
                if noise_mode == 'asym_two_unbalanced_classes':
                    for i in range(len(train_label)):
                        if train_label[i] != 1:
                            train_label[i] = 0
                #print("train_label=",train_label)
            elif dataset=='cifar100':    
                train_dic = unpickle('%s/train'%root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            # reuse the noise generated on the first initialization every time
            if os.path.exists(noise_file):
                noise_label = json.load(open(noise_file,"r"))
            else:    #inject noise
                noise_label = []
                idx = list(range(50000))
                random.shuffle(idx)
                #num_noise = int(self.r*50000)
                if noise_mode == 'sym':
                    num_noise = int((self.r / 9) / ( 1-self.r +  self.r / 9 ) * 50000)
                else:
                    num_noise = int(self.r * 50000)
                noise_idx = idx[:num_noise]
                for i in range(50000):
                    if i in noise_idx:
                        if noise_mode=='sym':
                            if dataset=='cifar10':
                                noiselabel = random.randint(0, 9)
                                #print("noiselabel=",noiselabel)
                                #print("train_label[i]=",train_label[i])
                                while noiselabel == train_label[i]:
                                    noiselabel = random.randint(0,9)
                            elif dataset=='cifar100':    
                                noiselabel = random.randint(0,99)
                            noise_label.append(noiselabel)
                        elif noise_mode=='asym_two_unbalanced_classes':
                            noiselabel = self.transition[train_label[i]]
                            noise_label.append(noiselabel)                    
                    else:    
                        noise_label.append(train_label[i])   
                print("save noisy labels to %s ..."%noise_file)        
                json.dump(noise_label,open(noise_file,"w"))       
            
            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            else:                   
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]  # 2-D result: [0] gives the sample (row) indices
                    self.probability = [probability[i] for i in pred_idx]   
                    
                    clean = (np.array(noise_label)==np.array(train_label))                                                       
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability,clean)        
                    auc,_,_ = auc_meter.value()               
                    log.write('Number of labeled samples:%d   AUC:%.3f\n'%(pred.sum(),auc))
                    log.flush()      
                    
                elif self.mode == "unlabeled":
                    pred_idx = (1-pred).nonzero()[0]                                               
                
                self.train_data = train_data[pred_idx]  # on each initialization this holds the labeled or the unlabeled data
                self.noise_label = [noise_label[i] for i in pred_idx]  # likewise the labeled or unlabeled labels
                print("%s data has a size of %d"%(self.mode,len(self.noise_label)))            
Example #18
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from asteroid_dataset import *
import torch.optim as optim
from classifier import *
import visdom
from torchnet.meter import ConfusionMeter
from torchnet.meter import AUCMeter

from sklearn.metrics import matthews_corrcoef

confusion_matrix = ConfusionMeter(2)
# temp_confusion_matrix = ConfusionMeter(2)
auc_meter = AUCMeter()
# confusion_matrix_validation = ConfusionMeter(2)
vis = visdom.Visdom()
draw_graph = None
draw_accuracy = None
draw_roc_curve = None

csv_file = "classifications.csv"
root_dir = "data/"
# hyperparameters
batch_size = 159
learning_rate = 0.001
epoch_num = 50

# experiment parameters
real_exp = True
Example #19
    def __init__(self, device):
        self.device = device
        # super(AllInOneMeter, self).__init__()
        self.out1auc1 = AUCMeter()
        self.out1auc2 = AUCMeter()
        self.out1auc3 = AUCMeter()
        self.out1auc4 = AUCMeter()
        self.out1auc5 = AUCMeter()
        self.out2auc1 = AUCMeter()
        self.out2auc2 = AUCMeter()
        self.out2auc3 = AUCMeter()
        self.out2auc4 = AUCMeter()
        self.out2auc5 = AUCMeter()
        self.loss1 = []
        self.loss2 = []
        self.loss3 = []
        self.loss = []
        self.jaccard = []
        # self.nbatch = 0
        self.intersection = torch.zeros([5],
                                        dtype=torch.float,
                                        device=self.device)
        self.union = torch.zeros([5], dtype=torch.float, device=self.device)
        self.reset()
Example #20
class AllInOneMeter(object):
    """
    All in one meter: AUC
    """
    def __init__(self, device):
        self.device = device
        # super(AllInOneMeter, self).__init__()
        self.out1auc1 = AUCMeter()
        self.out1auc2 = AUCMeter()
        self.out1auc3 = AUCMeter()
        self.out1auc4 = AUCMeter()
        self.out1auc5 = AUCMeter()
        self.out2auc1 = AUCMeter()
        self.out2auc2 = AUCMeter()
        self.out2auc3 = AUCMeter()
        self.out2auc4 = AUCMeter()
        self.out2auc5 = AUCMeter()
        self.loss1 = []
        self.loss2 = []
        self.loss3 = []
        self.loss = []
        self.jaccard = []
        # self.nbatch = 0
        self.intersection = torch.zeros([5],
                                        dtype=torch.float,
                                        device=self.device)
        self.union = torch.zeros([5], dtype=torch.float, device=self.device)
        self.reset()

    def reset(self):
        # self.scores = torch.DoubleTensor(torch.DoubleStorage()).numpy()
        # self.targets = torch.LongTensor(torch.LongStorage()).numpy()
        self.out1auc1.reset()
        self.out1auc2.reset()
        self.out1auc3.reset()
        self.out1auc4.reset()
        self.out1auc5.reset()
        self.out2auc1.reset()
        self.out2auc2.reset()
        self.out2auc3.reset()
        self.out2auc4.reset()
        self.out2auc5.reset()
        self.loss1 = []
        self.loss2 = []
        self.loss3 = []
        self.loss = []
        self.jaccard = []
        self.intersection = torch.zeros([5],
                                        dtype=torch.float,
                                        device=self.device)
        self.union = torch.zeros([5], dtype=torch.float, device=self.device)
        # self.nbatch = 0

    def add(self, mask_prob, true_mask, mask_ind_prob1, mask_ind_prob2,
            true_mask_ind, loss1, loss2, loss3, loss):
        self.out1auc1.add(mask_ind_prob1[:, 0].data, true_mask_ind[:, 0].data)
        self.out1auc2.add(mask_ind_prob1[:, 1].data, true_mask_ind[:, 1].data)
        self.out1auc3.add(mask_ind_prob1[:, 2].data, true_mask_ind[:, 2].data)
        self.out1auc4.add(mask_ind_prob1[:, 3].data, true_mask_ind[:, 3].data)
        self.out1auc5.add(mask_ind_prob1[:, 4].data, true_mask_ind[:, 4].data)
        self.out2auc1.add(mask_ind_prob2[:, 0].data, true_mask_ind[:, 0].data)
        self.out2auc2.add(mask_ind_prob2[:, 1].data, true_mask_ind[:, 1].data)
        self.out2auc3.add(mask_ind_prob2[:, 2].data, true_mask_ind[:, 2].data)
        self.out2auc4.add(mask_ind_prob2[:, 3].data, true_mask_ind[:, 3].data)
        self.out2auc5.add(mask_ind_prob2[:, 4].data, true_mask_ind[:, 4].data)
        self.loss1.append(loss1)
        self.loss2.append(loss2)
        self.loss3.append(loss3)
        self.loss.append(loss)
        # self.nbatch += true_mask.shape[0]
        y_pred = (mask_prob > 0.3).type(true_mask.dtype)
        y_true = true_mask
        self.intersection += (y_pred *
                              y_true).sum(dim=-2).sum(dim=-1).sum(dim=0)
        self.union += y_true.sum(dim=-2).sum(dim=-1).sum(dim=0) + y_pred.sum(
            dim=-2).sum(dim=-1).sum(dim=0)

    def value(self):
        jaccard_array = (self.intersection / (self.union - self.intersection))
        # jaccard_array = jaccard_array.data.cpu().numpy()
        jaccard = jaccard_array.mean()
        metrics = {
            'out1auc1': self.out1auc1.value()[0],
            'out1auc2': self.out1auc2.value()[0],
            'out1auc3': self.out1auc3.value()[0],
            'out1auc4': self.out1auc4.value()[0],
            'out1auc5': self.out1auc5.value()[0],
            'out2auc1': self.out2auc1.value()[0],
            'out2auc2': self.out2auc2.value()[0],
            'out2auc3': self.out2auc3.value()[0],
            'out2auc4': self.out2auc4.value()[0],
            'out2auc5': self.out2auc5.value()[0],
            'loss1': np.mean(self.loss1),
            'loss2': np.mean(self.loss2),
            'loss3': np.mean(self.loss3),
            'loss': np.mean(self.loss),
            'jaccard': jaccard.item(),
            'jaccard1': jaccard_array[0].item(),
            'jaccard2': jaccard_array[1].item(),
            'jaccard3': jaccard_array[2].item(),
            'jaccard4': jaccard_array[3].item(),
            'jaccard5': jaccard_array[4].item(),
        }
        return metrics
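
A note on value() above: self.union accumulates |A| + |B| (prediction and target masks are summed separately in add), so intersection / (union - intersection) is the usual Jaccard index

    J(A, B) = |A ∩ B| / (|A| + |B| - |A ∩ B|)

computed per channel and then averaged.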
Example #21
def test(segment_size_list):
    if test_dataset == 'MTAT':
        test_list_pub = pickle.load(open(os.path.join(MTAT_SPLIT_FOLDER, 'test_list_pub.cP'), 'rb'))
    if test_dataset == 'MSD':
        id7d_to_path = pickle.load(open(os.path.join(MSD_SPLIT_FOLDER, '7D_id_to_path.pkl'), 'rb'))
        idmsd_to_id7d = pickle.load(
            open(os.path.join(MSD_SPLIT_FOLDER, 'MSD_id_to_7D_id.pkl'), 'rb'))
        test_list_pub_id = pickle.load(
            open(os.path.join(MSD_SPLIT_FOLDER, 'filtered_list_test.cP'), 'rb'))
        test_list_pub = [id7d_to_path[idmsd_to_id7d[song]][:-9] + '.npy' for song in test_list_pub_id]
        del id7d_to_path, idmsd_to_id7d

    total_test_size = len(test_list_pub)

    n_inputs = 0
    for segment_size in segment_size_list:
        if segment_size == 18:
            n_inputs += 512
        if segment_size == 27:
            n_inputs += 512
        if segment_size == 54:
            n_inputs += 768
        if segment_size == 108:
            n_inputs += 1024
        if segment_size == 216:
            n_inputs += 1280

    local_models = []
    for segment_size in segment_size_list:
        loc_model = local_model(segment_size).cuda()
        loc_model.load_state_dict(torch.load(os.path.join(ENCODER_FOLDER, 'local_model_' + str(segment_size) + '.pt')))
        loc_model.eval()
        local_models.append(loc_model)
    model = global_model(n_inputs, 512).cuda()
    model.load_state_dict(torch.load(os.path.join(ENCODER_FOLDER, 'global_model_18_27_54_9051_123.pt')))
    model.eval()
    auc = AUCMeter()

    for start in range(0, total_test_size, n_songs):
        print("Loading dataset...", start)
        if test_dataset == 'MTAT':
            test_features = np.concatenate(
                [np.load(os.path.join(MTAT_NPY_FOLDER, 'testing/' + test_list_pub[i])) for i in
                 range(start, min(start + n_songs, total_test_size))])
            test_labels = np.load(
                os.path.join(MTAT_SPLIT_FOLDER, 'y_test_pub.npy'))[start:min(start + n_songs, total_test_size)]
        if test_dataset == 'MSD':
            test_features = np.concatenate(
                [np.expand_dims(np.load(os.path.join(MSD_NPY_FOLDER, 'testing/' + test_list_pub[i]))[:, :1255], axis=0)
                 for i in range(start, min(start + n_songs, total_test_size))])
            idmsd_to_tag = pickle.load(
                open(os.path.join(MSD_SPLIT_FOLDER, 'msd_id_to_tag_vector.cP'), 'rb'))
            test_labels = np.concatenate(
                [idmsd_to_tag[idmsd] for idmsd in test_list_pub_id[start:min(start + n_songs, total_test_size)]],
                axis=1)

        if normalization:
            mean = np.mean(test_features, axis=0)
            var = np.var(test_features, axis=0)
            test_features = (test_features - mean) / np.sqrt(var)

        test_data = CustomDataset(test_features, test_labels)
        test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)
        print("Dataset loaded")

        for data, labels in test_loader:
            X = Variable(data).cuda()
            X = torch.cat([loc_model(X)[1] for loc_model in local_models], dim=1)
            out, _ = model(X)
            auc_out = np.reshape(out.data.cpu().numpy(), -1)
            auc_target = np.reshape(labels, -1)
            auc.add(auc_out, auc_target)

        del test_features, test_labels, test_data, test_loader

    auc_tuple = auc.value()
    print("AUC = ", auc_tuple[0])
Example #22
    def __init__(self,
                 dataset,
                 r,
                 noise_mode,
                 root_dir,
                 transform,
                 mode,
                 noise_file='',
                 clean_file='',
                 pred=[],
                 probability=[],
                 log=''):

        self.r = r  # noise ratio
        self.transform = transform
        self.noise_mode = noise_mode
        self.mode = mode
        self.transition = {
            0: 0,
            2: 0,
            4: 7,
            7: 7,
            1: 1,
            9: 1,
            3: 5,
            5: 3,
            6: 6,
            8: 8
        }  # class transition for asymmetric noise

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/data/cifar-10-batches-py/test_batch' %
                                    root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/data/cifar-100-python/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                #print("current path is {}".format(sys.path[0]))
                for n in range(1, 6):
                    dpath = '%s/data/cifar-10-batches-py/data_batch_%d' % (
                        root_dir, n)
                    #print("path is {}".format(dpath))
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/data/cifar-100-python/train' %
                                     root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            train_label = np.array(train_label)
            noise_label = train_label.copy()
            if dataset == 'cifar10':
                nb_classes = 10
            elif dataset == 'cifar100':
                nb_classes = 100
            clean_per_class = int(5000 / nb_classes)  # cifar10: 500 per class; cifar100: 50
            noise_per_class = int(50000 / nb_classes * r)

            # select clean_per_class samples of each class to keep clean,
            # and leave the remaining samples as candidates for noise injection.
            # class 0 is handled here; classes 1..nb_classes-1 in the loop below.
            all_index = np.arange(50000).reshape(-1)
            clean_indices = all_index[np.where(
                train_label == 0)[0]][-clean_per_class:]
            noise_idx = [
                all_index[np.where(train_label == 0)[0]][:-clean_per_class]
            ]
            # collect clean/noisy index splits for classes 1..nb_classes-1
            for i in range(nb_classes - 1):
                indices1 = all_index[np.where(train_label == i +
                                              1)[0]][-clean_per_class:]
                noisy_indices1 = all_index[np.where(train_label == i +
                                                    1)[0]][:-clean_per_class]
                clean_indices = np.concatenate((clean_indices, indices1))
                noise_idx.append(noisy_indices1)
            #add noise
            for t, i in enumerate(noise_idx):
                # use one image of the class (index 10) as the center
                image_center = train_data[i[10]]
                norm_loss = np.zeros(len(i))
                for j, k in enumerate(i):
                    images = train_data[k]
                    norm_loss[j] = np.linalg.norm(image_center - images)
                noisy_indices = i[norm_loss.argsort()[:noise_per_class]]
                noise_label[noisy_indices] = (t + 1) % nb_classes

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            elif self.mode == 'small':
                self.train_data = train_data[::100]
                self.noise_label = noise_label[::100]
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    #clean = (np.array(noise_label)==np.array(train_label))
                    clean = (noise_label == train_label)
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    auc, _, _ = auc_meter.value()
                    log.write('Number of labeled samples:%d   AUC:%.3f\n' %
                              (pred.sum(), auc))
                    log.flush()

                elif self.mode == "unlabeled":
                    pred_idx = (1 - pred).nonzero()[0]

                self.train_data = train_data[pred_idx]
                self.noise_label = noise_label[pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_label)))
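
Finally, a toy illustration of the pred indexing convention used throughout the DivideMix-style loaders above: pred is a binary mask over the training set, the labeled split keeps pred.nonzero()[0], and the unlabeled split keeps the complement via (1 - pred).nonzero()[0]:

import numpy as np

pred = np.array([1, 0, 1, 1, 0])         # 1 = predicted clean (labeled), 0 = noisy (unlabeled)
labeled_idx = pred.nonzero()[0]          # -> array([0, 2, 3])
unlabeled_idx = (1 - pred).nonzero()[0]  # -> array([1, 4])
print(labeled_idx, unlabeled_idx)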