def write_dataset_info(ds, active_indices, normal_indices, filename):
    """Append one CSV row with the per-class sample counts of two index sets.

    The row contains ten counters for ``active_indices``, one empty
    separator cell, then ten counters for ``normal_indices``.  It is appended
    to the file ``filename + "_datainfo.csv"``.

    Args:
        ds: object exposing ``_train_val_set``; a loader built on it is
            unpacked as ``(input, target, index)`` with a one-element
            ``target``.
        active_indices: iterable of dataset indices for the first half of
            the row.
        normal_indices: iterable of dataset indices for the second half of
            the row.
        filename: path prefix of the CSV file.
    """
    num_classes = 10

    def count_targets(indices):
        # One sample per batch so that ``target.item()`` is the class id.
        counts = [0] * num_classes
        loader = tud.DataLoader(
            ds._train_val_set, batch_size=1, shuffle=False, num_workers=2,
            sampler=customcifar.CustomSampler(list(indices)))
        for _, target, _ in loader:
            counts[target.item()] += 1
        return counts

    with torch.no_grad():
        active_els = count_targets(active_indices)
        normal_els = count_targets(normal_indices)

    # newline="" hands line-ending control to the csv module; without it
    # platforms that translate "\n" end up with blank lines between rows.
    with open(filename + "_datainfo.csv", "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(active_els + [""] + normal_els)
def generate_weak_labels(net, cds, indices, howmany, train_indices, n=5):
    """Run ``net`` over ``indices`` through ``n`` loaders and print the scores.

    For every batch the predicted class (arg-max of ``output[0]``) of each of
    the ``n`` passes is stacked column-wise, handed to
    ``acquisition_functions.confidence(..., details=True)`` and turned into
    ``1 - value / n``.  The per-batch predictions and the final accumulator
    are printed; nothing is returned.  ``howmany`` and ``train_indices`` are
    accepted but not used.
    """
    device = "cuda:0"
    net.eval()
    # Slot 0 accumulates the scores, slot 1 the third element of each batch
    # (presumably the dataset index of every sample -- TODO confirm).
    normalized_confidence = [torch.Tensor().to(device), torch.Tensor().long()]
    randomized_list = numpy.random.choice(list(indices), len(indices), replace=False)

    # NOTE(review): the loaders wrap ``cds.train_indices`` rather than a
    # dataset object; kept as in the original, verify this is intended.
    dataloaders = []
    for _ in range(n):
        dataloaders.append(
            tud.DataLoader(cds.train_indices, batch_size=500, shuffle=False,
                           num_workers=4,
                           sampler=customcifar.CustomSampler(randomized_list)))

    with torch.no_grad():
        for element in zip(*dataloaders):
            normalized_confidence[1] = torch.cat(
                (normalized_confidence[1], element[0][2]), 0)
            passes = list(element)
            predictions = torch.Tensor().long()
            for batch in passes:
                batch[0], batch[1] = batch[0].to(device), batch[1].to(device)
                output = net(batch[0])
                winners = output[0].max(1)[1]
                column = winners.reshape(len(output[0]), 1).cpu()
                predictions = torch.cat((predictions, column), 1)
            print(predictions)
            confidence_values = torch.Tensor(
                acquisition_functions.confidence(
                    predictions.transpose(0, 1), details=True)).cpu()
            batch_scores = 1 - confidence_values / n
            normalized_confidence[0] = torch.cat(
                (normalized_confidence[0].cpu(), batch_scores), 0).cpu()
    print(normalized_confidence)
def entropy(self, ds, indices, howmany):
    """Return the ``howmany`` sample ids with the largest average entropy.

    ``indices`` are visited in a random order through five identical loaders
    iterated in lock-step; the five network outputs of each batch are scored
    by ``acquisition_functions.avg_entropy``.  Samples are then ranked by
    that score, highest first, and the id stored in the third element of the
    batch is returned for the top ``howmany``.
    """
    device = "cuda:0"
    tots = len(indices)
    self.net.eval()
    scored = []  # [score, sample id] pairs
    randomized_list = numpy.random.choice(list(indices), len(indices), replace=False)

    dataloaders = []
    for _ in range(5):
        dataloaders.append(
            tud.DataLoader(ds._train_val_set, batch_size=100, shuffle=False,
                           num_workers=2,
                           sampler=customcifar.CustomSampler(randomized_list)))

    with torch.no_grad():
        for element in zip(*dataloaders):
            passes = list(element)
            for batch in passes:
                # Batches are mutated in place, so they must be lists.
                batch[0], batch[1] = batch[0].to(device), batch[1].to(device)
            outputs = [self.net(batch[0]) for batch in passes]
            confidence = acquisition_functions.avg_entropy(outputs)
            sample_ids = passes[0][2]
            for pos in range(len(confidence)):
                scored.append([confidence[pos], sample_ids[pos].item()])
            print("\r Checked: {0} / {1}".format(len(scored), tots), end='')

    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [pair[1] for pair in scored[:howmany]]
def greedy_k_centers(self, ds, indices, howmany, _train_loader, n=5):
    """Pick the ``howmany`` candidates farthest from the reference samples.

    Every sample is embedded as the first element of the network output.
    For each candidate in ``indices`` the Euclidean distance to its nearest
    sample from ``_train_loader`` is computed, and the candidates with the
    largest such distance are returned:
    ``x = arg max(i in S/N) min(j in N) d(X_i, X_j)``.
    The ranking is computed once; distances are not updated after each pick.

    Changes from the previous version: it no longer starts with a
    ``self.kl_divergence(...)`` call whose result was discarded (and which
    was handed the loader where that method expects indices), and the
    minimum is taken on the CPU tensor directly instead of after a round
    trip to the GPU.

    Args:
        ds: object exposing ``_train_val_set``.
        indices: candidate dataset indices.
        howmany: number of indices to return.
        _train_loader: loader yielding ``(inputs, targets, index)`` batches
            of the reference samples.
        n: unused; kept for signature compatibility.

    Returns:
        List of candidate indices (elements of the shuffled numpy array),
        farthest first.  Empty when ``indices`` is empty.
    """
    device = "cuda:0"
    self.net.eval()
    if len(indices) == 0:
        return []

    randomized_list = numpy.random.choice(list(indices), len(indices), replace=False)
    dataloader = tud.DataLoader(
        ds._train_val_set, batch_size=500, shuffle=False, num_workers=2,
        sampler=customcifar.CustomSampler(randomized_list))

    def embed(loader):
        # Concatenate once at the end instead of growing a tensor per batch.
        chunks = [self.net(inputs.to(device))[0] for inputs, _, _ in loader]
        return torch.cat(chunks, 0) if chunks else torch.Tensor().to(device)

    with torch.no_grad():
        N = embed(_train_loader)
        S = embed(dataloader)
        # Pairwise differences are built on the CPU (len(S) x len(N) x dim).
        differences = S.to("cpu").unsqueeze(1) - N.to("cpu").unsqueeze(0)
        print(differences.size())
        dist_m = torch.sum(differences * differences, -1).pow(.5)
        nearest = torch.min(dist_m, 1)[0]

    # The sampler is not shuffled, so row k of S belongs to randomized_list[k].
    mindist = list(zip(randomized_list, nearest))
    sorlist = sorted(mindist, key=lambda xp: xp[1].item(), reverse=True)
    print(sorlist)
    return [x[0] for x in sorlist[:howmany]]
def all_train(self, otherDS=None, excluded=None):
    """Build a one-sample-per-batch loader over the training indices.

    Args:
        otherDS: dataset to read from; ``self._train_val_set`` is used when
            it is ``None``.
        excluded: optional collection of indices to leave out.  ``None``
            (the default) excludes nothing.  Entries must be hashable.

    Returns:
        A ``DataLoader`` visiting ``self.train_indices`` minus ``excluded``
        through ``customcifar.CustomSampler``.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    skipped = set(excluded) if excluded is not None else set()
    dataset = self._train_val_set if otherDS is None else otherDS
    kept = [x for x in self.train_indices if x not in skipped]
    return tud.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=2,
                          sampler=customcifar.CustomSampler(kept))
def __init__(self, transform=None, first_time_multiplier=1, name=None, joking=False):
    """Load the datasets, split the indices and build the three loaders.

    Args:
        transform: transform forwarded to both ``UnbalancedCIFAR10`` sets.
        first_time_multiplier: the initial labelled pool holds
            ``tslp * first_time_multiplier`` randomly drawn train indices.
        name: forwarded as ``filename`` to the train/validation set.
        joking: when true nothing is initialised and the instance is left
            without attributes.

    Attributes set: ``_train_val_set``, ``_test_set``,
    ``validation_indices``, ``train_indices``, ``already_selected_indices``
    and the loaders ``_train``, ``_v`` and ``_t``.
    """
    if joking:
        return

    self._train_val_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=True, download=True, transform=transform,
        filename=name, percentage=.1)
    # Original author's note on the test set: "10000".
    self._test_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=False, download=True, transform=transform)

    self.validation_indices = self._train_val_set._val_indices
    # Set lookup avoids scanning the validation list once per dataset index.
    held_out = set(self.validation_indices)
    self.train_indices = [
        x for x in self._train_val_set.indices if x not in held_out
    ]

    self.already_selected_indices = numpy.random.choice(
        self.train_indices, size=tslp * first_time_multiplier,
        replace=False).tolist()

    self._train = tud.DataLoader(
        self._train_val_set, batch_size=train_batch_size, shuffle=False,
        num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.already_selected_indices))
    self._v = tud.DataLoader(
        self._train_val_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.validation_indices))
    self._t = torch.utils.data.DataLoader(
        self._test_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomSampler(list(range(len(self._test_set)))))
def bestofn(self, ds, indices, howmany, n=5):
    """Rank samples by prediction disagreement times neighbour distance.

    ``indices`` are visited in a random order through ``n`` identical
    loaders iterated in lock-step.  For every sample the score is
    ``(1 - confidence / n) * d`` where ``confidence`` comes from
    ``acquisition_functions.confidence`` applied to the ``n`` arg-max
    predictions and ``d`` is the Euclidean distance, measured on the second
    network output of the first pass, to the closest *other* sample of the
    same batch.

    The neighbour distance is computed with one tensor reduction per batch
    instead of one ``.item()`` call per pair.  A batch holding a single
    sample has no neighbour; its distance is taken as 0.0 (score 0) rather
    than failing on ``min()`` of an empty list.

    Args:
        ds: object exposing ``_train_val_set``.
        indices: candidate dataset indices.
        howmany: number of sample ids to return.
        n: number of passes per batch.

    Returns:
        The ids (third element of each batch) of the ``howmany`` best
        scoring samples, highest score first.
    """
    device = "cuda:0"
    self.net.eval()
    total_normalized_confidence = 0
    total = 0
    list_of_errors = []  # [score, sample id] pairs
    errors_by_class = [0] * 10
    printiter = 0
    randomized_list = numpy.random.choice(list(indices), len(indices), replace=False)
    dataloaders = [
        tud.DataLoader(ds._train_val_set, batch_size=500, shuffle=False,
                       num_workers=2,
                       sampler=customcifar.CustomSampler(randomized_list))
        for _ in range(n)
    ]

    with torch.no_grad():
        for element in zip(*dataloaders):
            els = list(element)
            for batch in els:
                batch[0], batch[1] = batch[0].to(device), batch[1].to(device)
            res_net = [self.net(batch[0]) for batch in els]
            outputs = [res[0] for res in res_net]
            intrep = res_net[0][1]  # representation of the first pass only
            predictions = [out.max(1)[1] for out in outputs]
            normalized_confidence = [
                float(c / n)
                for c in acquisition_functions.confidence(predictions)
            ]

            differences = intrep.unsqueeze(1) - intrep.unsqueeze(0)
            dist_m = torch.sum(differences * differences, -1).pow(.5)
            batch_len = dist_m.size(0)
            if batch_len > 1:
                # Adding +inf on the diagonal removes each sample's zero
                # distance to itself; off-diagonal entries get +0.
                self_mask = torch.diag(dist_m.new_full((batch_len,), float("inf")))
                nearest = torch.min(dist_m + self_mask, 1)[0].tolist()
            else:
                nearest = [0.0] * batch_len

            targets = els[0][1]
            sample_ids = els[0][2]
            for pos, confidence in enumerate(normalized_confidence):
                list_of_errors.append(
                    [(1 - confidence) * nearest[pos], sample_ids[pos].item()])
                if confidence < 1:
                    errors_by_class[targets[pos].item()] += 1
                total_normalized_confidence += confidence
                total += 1
                if printiter % 50 == 0:
                    print("\r Avg confidence: {0:.2f}% ({1:.1f}/{2}) {3}".format((total_normalized_confidence / total)*100, total_normalized_confidence, total, ""), end='')
                printiter += 1

    # Original author's note (translated): this part needs to be changed.
    sorlist = sorted(list_of_errors, key=lambda xp: xp[0], reverse=True)
    print("\n Errors by class: {0}".format(["{0}: {1}".format(i, errors_by_class[i]) for i in range(10)]))
    return [el[1] for el in sorlist[:howmany]]
def restore(self, all, selected, validation, transform=None, name=None):
    """Rebuild the datasets and loaders from previously saved index lists.

    Args:
        all: every index of the train/validation set; passed on, together
            with ``validation``, as ``provided_indices``.
        selected: indices that become ``already_selected_indices``.
        validation: indices that become ``validation_indices``.
        transform: transform forwarded to both ``UnbalancedCIFAR10`` sets.
        name: forwarded as ``filename`` to the train/validation set.

    Returns:
        ``self``, so the call can be chained after construction.
    """
    self._train_val_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=True, download=True, transform=transform,
        filename=name, percentage=.1, provided_indices=(all, validation))
    # Original author's note on the test set: "10000".
    self._test_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=False, download=True, transform=transform)

    self.validation_indices = validation
    # Set lookup avoids scanning the validation list once per index.
    held_out = set(validation)
    self.train_indices = [x for x in all if x not in held_out]
    self.already_selected_indices = selected

    self._train = tud.DataLoader(
        self._train_val_set, batch_size=train_batch_size, shuffle=False,
        num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.already_selected_indices))
    self._v = tud.DataLoader(
        self._train_val_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.validation_indices))
    self._t = torch.utils.data.DataLoader(
        self._test_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomSampler(list(range(len(self._test_set)))))
    return self
def __init__(self, transform=None, first_time_multiplier=1, name=None, unbal=True):
    """Load the datasets, draw the initial labelled pool and build loaders.

    Two per-class count lists are printed: one for the train indices and,
    prefixed with ``"Selected: "``, one for the initial pool.

    Args:
        transform: transform forwarded to both ``UnbalancedCIFAR10`` sets.
        first_time_multiplier: with ``unbal`` true the pool holds
            ``tslp * first_time_multiplier`` indices.
        name: forwarded as ``filename`` to the train/validation set.
        unbal: when true the pool is drawn uniformly from all train indices;
            otherwise ``tslp`` indices are drawn as evenly as possible from
            the ten classes, never from the validation indices.

    The balanced quota is now ``tslp // 10`` per class plus one extra for
    the first ``tslp % 10`` classes.  The previous remainder expression
    (``tslp % int(tslp / 10)``) gave the same result for ``tslp >= 100`` but
    could select fewer than ``tslp`` samples for smaller values and raised
    ``ZeroDivisionError`` for ``tslp < 10``.
    """
    num_classes = 10

    self._train_val_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=True, download=True, transform=transform,
        filename=name, percentage=.1)
    # Original author's note on the test set: "10000".
    self._test_set = customcifar.UnbalancedCIFAR10(
        root="./cifar", train=False, download=True, transform=transform)

    self.validation_indices = self._train_val_set._val_indices
    # Sets turn the repeated membership tests below into O(1) lookups.
    held_out = set(self.validation_indices)
    self.train_indices = [
        x for x in self._train_val_set.indices if x not in held_out
    ]

    el_for_class = self._train_val_set.el_for_class
    class_members = [set(el_for_class[i]) for i in range(num_classes)]

    def per_class_counts(selection):
        # Number of entries of ``selection`` belonging to each class.
        return [
            sum(1 for x in selection if x in members)
            for members in class_members
        ]

    print(per_class_counts(self.train_indices))

    if unbal:
        self.already_selected_indices = numpy.random.choice(
            self.train_indices, size=tslp * first_time_multiplier,
            replace=False).tolist()
    else:
        # NOTE(review): this branch ignores first_time_multiplier, as the
        # original did -- confirm whether that is intended.
        base, extra = divmod(int(tslp), num_classes)
        lenel = [base + (1 if i < extra else 0) for i in range(num_classes)]
        self.already_selected_indices = []
        for i in range(num_classes):
            candidates = [xx for xx in el_for_class[i] if xx not in held_out]
            self.already_selected_indices.extend(
                numpy.random.choice(candidates, size=lenel[i],
                                    replace=False).tolist())

    print("Selected: {}".format(per_class_counts(self.already_selected_indices)))

    self._train = tud.DataLoader(
        self._train_val_set, batch_size=train_batch_size, shuffle=False,
        num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.already_selected_indices))
    self._v = tud.DataLoader(
        self._train_val_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomRandomSampler(self.validation_indices))
    self._t = torch.utils.data.DataLoader(
        self._test_set, batch_size=100, shuffle=False, num_workers=2,
        sampler=customcifar.CustomSampler(list(range(len(self._test_set)))))
def distance_and_varratio(self, ds, indices, howmany, train_indices, n=5):
    """Greedily select ``howmany`` samples by distance plus disagreement.

    Each sample is represented by the second network output, reshaped to 512
    values and averaged over ``n`` passes.  The selection score of a
    candidate is
    ``distance_weight * (mindist / max(mindist)) + varratio_weight * conf``
    where ``mindist`` is the Euclidean distance to the nearest already
    chosen or ``train_indices`` sample and ``conf`` is
    ``1 - acquisition_functions.confidence(predictions) / n``.
    After every pick the distances are refreshed against the new pick.

    Args:
        ds: object exposing ``_train_val_set``.
        indices: candidate dataset indices.
        howmany: number of ids to return.
        train_indices: indices of the reference (labelled) samples.
        n: number of passes per batch.

    Returns:
        List of the selected sample ids (third element of each batch), in
        the order they were picked.
    """
    distance_weight = 1e-5
    varratio_weight = 1
    self.net.eval()
    N = torch.Tensor().to("cuda:0")  # labelled
    S = torch.Tensor().to("cuda:0")  # unlabelled
    # Slot 0: per-candidate disagreement score, slot 1: candidate ids.
    normalized_confidence = [torch.Tensor().to("cuda:0"), torch.Tensor().long()]
    randomized_list = numpy.random.choice([x for x in indices], len(indices), replace=False)
    # NOTE(review): each of the n labelled loaders gets its own
    # CustomRandomSampler; if that sampler shuffles, the passes zipped
    # together below may not refer to the same samples -- confirm.
    trainloaders = [tud.DataLoader(ds._train_val_set, batch_size=500, shuffle=False, num_workers=4,
                                   sampler=customcifar.CustomRandomSampler(train_indices)) for i in range(n)]
    dataloaders = [tud.DataLoader(ds._train_val_set, batch_size=500, shuffle=False, num_workers=4,
                                  sampler=customcifar.CustomSampler(randomized_list)) for i in range(n)]
    with torch.no_grad():
        for batch_index, element in enumerate(zip(*trainloaders)):  # labelled samples
            els = [x for x in element]
            o = torch.Tensor().to("cuda:0")
            for input in els:
                input[0], input[1] = input[0].to("cuda:0"), input[1].to("cuda:0")
                # Stack the representation of every pass along a third axis.
                o = torch.cat((o, self.net(input[0])[1].reshape(len(input[0]), 512, 1)), 2)
            N = torch.cat((N, o), 0)
            print("\r N: {0} ".format(N.size()), end="")
        print("")
        for batch_index, element in enumerate(zip(*dataloaders)):  # unlabelled samples
            normalized_confidence[1] = torch.cat((normalized_confidence[1], element[0][2]), 0)
            els = [x for x in element]
            o = torch.Tensor().to("cuda:0")
            predictions = torch.Tensor().long()
            for input in els:
                input[0], input[1] = input[0].to("cuda:0"), input[1].to("cuda:0")
                output = self.net(input[0])
                out = output[1].reshape(len(input[0]), 512, 1)
                o = torch.cat((o, out), 2)
                # One column of arg-max predictions per pass.
                predictions = torch.cat((predictions, output[0].max(1)[1].reshape(len(output[0]), 1).cpu()), 1)
            normalized_confidence[0] = torch.cat((normalized_confidence[0].cpu(), 1 - torch.Tensor(
                acquisition_functions.confidence(predictions.transpose(0,1))).cpu() / n), 0).cpu()
            S = torch.cat((S, o), 0)
            print("\r S: {0} ".format(S.size()), end="")
        print("")
    # Average the n passes stacked along the last axis.
    S = (torch.sum(S, 2)) / n
    N = (torch.sum(N, 2)) / n
    # Candidates are processed 25 at a time; each partial distance matrix is
    # moved to the CPU before being appended.
    S_batches = torch.split(S, 25, dim =0)
    dist_S_N = torch.Tensor()
    for el in S_batches:
        partial_dist = el.unsqueeze(1) - N.unsqueeze(0)
        partial_dist = torch.sum(partial_dist * partial_dist, -1)
        partial_dist = torch.sqrt(partial_dist)
        dist_S_N = torch.cat((dist_S_N, partial_dist.cpu()), 0)
    mindist = torch.min(dist_S_N, 1)[0].to("cuda:0")
    # The largest initial distance stays the normaliser for the whole loop.
    normalizing_factor = torch.max(mindist, -1)[0]
    print("NF : " + str(normalizing_factor))
    mindist_confidence = (distance_weight*(mindist / normalizing_factor)) + (varratio_weight * normalized_confidence[0].to("cuda:0"))
    # Original author's note (translated): I still have to compute the confidence.
    erlist_indexes = normalized_confidence[1]
    new_N = []
    for i in range(howmany):
        # An earlier variant selected on the distance alone:
        # maxx = torch.max(mindist, -1)[1]
        maxx = torch.max(mindist_confidence, -1)[1]
        print("Max: {0:.3f} = ({1:.3f} * {3}) + ({2:.3f} * {4})".format(mindist_confidence[maxx], mindist[maxx]/normalizing_factor, normalized_confidence[0][maxx], distance_weight, varratio_weight))
        if erlist_indexes[maxx].item() in new_N:
            print("Error: Duplicate")
        new_N.append(erlist_indexes[maxx].item())
        # -inf keeps the picked candidate from being chosen again; it
        # survives the torch.min and the score recomputation below.
        mindist[maxx] = float("-inf")
        mindist_confidence[maxx] = float("-inf")
        # Distance of every candidate to the sample just picked.
        newdists = S - S[maxx].reshape(1, len(S[maxx]))
        newdists = torch.sum(newdists * newdists, -1)
        newdists = torch.sqrt(newdists)
        mindist = torch.min(mindist, newdists)
        mindist_confidence = (distance_weight*(mindist / normalizing_factor)) + (varratio_weight * normalized_confidence[0].to("cuda:0"))
    return new_N
def kl_divergence(self, ds, indices, howmany, train_indices, n=5):
    """Rank candidates by a divergence-to-labelled score times disagreement.

    The first network output (reshaped to 10 values) is collected for ``n``
    passes over the ``train_indices`` samples (``N``) and over the
    candidates in ``indices`` (``S``), soft-maxed and averaged over the
    passes.  For every candidate/labelled pair the sum over the 10 classes
    of ``log2(S / N) * S`` is computed; the minimum over the labelled
    samples is multiplied by
    ``1.1 - acquisition_functions.confidence(predictions) / n`` and the
    ``howmany`` highest scoring candidate ids are returned.

    Args:
        ds: object exposing ``_train_val_set``.
        indices: candidate dataset indices.
        howmany: number of ids to return.
        train_indices: indices of the reference (labelled) samples.
        n: number of passes per batch.

    Returns:
        List of sample ids (third element of each batch), best score first.
    """
    self.net.eval()
    N = torch.Tensor().to("cuda:0")  # labelled
    S = torch.Tensor().to("cuda:0")  # unlabelled
    # Slot 0: per-candidate weight, slot 1: candidate ids.
    normalized_confidence = [torch.Tensor().to("cuda:0"), torch.Tensor().long()]
    randomized_list = numpy.random.choice([x for x in indices], len(indices), replace=False)
    trainloaders = [tud.DataLoader(ds._train_val_set, batch_size=500, shuffle=False, num_workers=4,
                                   sampler=customcifar.CustomRandomSampler(train_indices)) for i in range(n)]
    dataloaders = [tud.DataLoader(ds._train_val_set, batch_size=500, shuffle=False, num_workers=4,
                                  sampler=customcifar.CustomSampler(randomized_list)) for i in range(n)]
    with torch.no_grad():
        for batch_index, element in enumerate(zip(*trainloaders)):  # labelled samples
            els = [x for x in element]
            o = torch.Tensor().to("cuda:0")
            for input in els:
                input[0], input[1] = input[0].to("cuda:0"), input[1].to("cuda:0")
                # Stack the 10 outputs of every pass along a third axis.
                o = torch.cat((o, self.net(input[0])[0].reshape(len(input[0]),10, 1)), 2)
            N = torch.cat((N, o), 0)
            print("\r N: {0} ".format(N.size()), end="")
        print("")
        for batch_index, element in enumerate(zip(*dataloaders)):  # unlabelled samples
            normalized_confidence[1] = torch.cat((normalized_confidence[1], element[0][2]), 0)
            els = [x for x in element]
            o = torch.Tensor().to("cuda:0")
            predictions = torch.Tensor().long().to("cuda:0")
            for input in els:
                input[0], input[1] = input[0].to("cuda:0"), input[1].to("cuda:0")
                out = self.net(input[0])[0].reshape(len(input[0]), 10, 1)
                o = torch.cat((o, out), 2)
                # out.max(1)[1] keeps the trailing axis, giving one column
                # of arg-max predictions per pass.
                predictions = torch.cat((predictions, out.max(1)[1]), 1).to("cuda:0")
            # The 1.1 offset keeps the weight strictly positive as long as
            # confidence / n does not exceed 1 (presumably its maximum --
            # TODO confirm).
            normalized_confidence[0] = torch.cat((normalized_confidence[0].cpu(), 1.1 - torch.Tensor(acquisition_functions.confidence(predictions.transpose(1, 0))).cpu() / n), 0).cpu()
            S = torch.cat((S, o), 0)
            print("\r S: {0} ".format(S.size()), end="")
        print("")
    # calc KL divergence
    # Soft-max over the 10 outputs, then average the n passes.
    S = (torch.sum(F.softmax(S, dim=1), 2)) /n
    N = (torch.sum(F.softmax(N, dim=1), 2)) /n
    # Broadcast ratio of every candidate against every labelled sample.
    S_on_N = S.to("cpu").unsqueeze(1) / N.to("cpu").unsqueeze(0)
    # NOTE(review): S_on_N is built as (len(S), len(N), 10); reshaping it to
    # (len(N), len(S), 10) and then transposing re-interprets the memory
    # layout rather than leaving it unchanged -- confirm this is intended.
    ln_S_on_N = numpy.log2(S_on_N).reshape(len(N), len(S), 10).transpose(0,1)
    # Processed 300 candidates at a time; the batched product runs on the GPU.
    ln_S_on_N_batches = torch.split(ln_S_on_N, 300, dim=0)
    S_batches = torch.split(S, 300, dim=0)
    kldiv = torch.Tensor()
    for i in range(len(ln_S_on_N_batches)):
        partial_kldiv = torch.bmm(ln_S_on_N_batches[i].to("cuda:0"), S_batches[i].reshape(len(S_batches[i]), 10, 1)).cpu()
        # NOTE(review): each new chunk is placed BEFORE the accumulated
        # ones, so the chunks end up in reverse order relative to
        # normalized_confidence -- verify.
        kldiv = torch.cat((partial_kldiv, kldiv), 0)
    print(kldiv.size())
    kldiv = kldiv.reshape(len(S), len(N))
    # Smallest value over the labelled samples, weighted per candidate.
    mindiv = torch.min(kldiv, 1)[0]* normalized_confidence[0]
    errorlist = [[mindiv[i].item(), normalized_confidence[1][i].item() ]for i in range(len(normalized_confidence[0]))]
    sorlist = sorted(errorlist, key=lambda xp: xp[0], reverse=True)
    return [x[1] for x in sorlist[:howmany]]