def test():
    """Run one evaluation pass over the test set and log loss/perplexity.

    Relies on module-level state: ``test_set``, ``test_loader``, ``dataset``,
    ``device``, ``run_model``, ``epoch``, ``test_loss_plot`` and ``ppl_plot``.
    No-op when no test set is configured.
    """
    if test_set is None:
        return
    start_time = time.time()
    t = test_set.start_test()
    test_loss = []
    with torch.no_grad():
        for data in tqdm(test_loader):
            # Move only tensor values to the target device; leave other
            # entries (metadata, strings) untouched.
            data = {
                k: v.to(device) if torch.is_tensor(v) else v
                for k, v in data.items()
            }
            # Optional dataset-specific preprocessing hook.
            if hasattr(dataset, "prepare"):
                data = dataset.prepare(data)
            net_out = run_model(data)
            test_set.verify_result(t, data, net_out)
            test_loss.append(
                dataset.loss(net_out, data["output"]).data.item())
    avg_loss = sum(test_loss) / len(test_loss)
    perplexity = math.exp(avg_loss)
    test_loss_plot.add_point(epoch, avg_loss)
    if epoch > 5:  # Perplexity is immensely high in the beginning
        ppl_plot.add_point(epoch, perplexity)
    test_set.show_test_results(epoch, t)
    print("Test done in %gs" % (time.time() - start_time))
def _load_dataset(
        path, mmap_mode=None, as_torch_tensors=True
) -> Tuple[torch_geometric.data.Data, Dict[str, torch.Tensor]]:
    """Load a collated torch-geometric dataset from an ``.npz`` archive.

    The archive stores a pickled Data subclass under ``_data_class`` plus
    arrays keyed ``data_<attr>`` and ``slices_<attr>``.

    :param path: path to the ``.npz`` file.
    :param mmap_mode: forwarded to ``np.load`` for memory-mapped access.
    :param as_torch_tensors: convert arrays to torch tensors when True.
    :return: (reconstructed Data object, dict of slice tensors/arrays).
    :raises ValueError: on a key with an unknown prefix.
    """
    data = np.load(path, mmap_mode=mmap_mode)
    # NOTE(review): pickle.loads executes arbitrary code on load — only
    # open archives from trusted sources.
    data_class = pickle.loads(data['_data_class'])
    data_dict = {}
    slices_dict = {}
    if as_torch_tensors:
        def convert_tensor(x):
            return torch.from_numpy(x)
    else:
        def convert_tensor(x):
            return x
    for k, v in data.items():
        if k == '_data_class':
            continue
        # Keys are '<group>_<attr>'; attr itself may contain underscores.
        group, key = k.split('_', 1)
        if group == 'data':
            data_dict[key] = convert_tensor(v)
        elif group == 'slices':
            slices_dict[key] = convert_tensor(v)
        else:
            raise ValueError('Unknown key prefix {} for key {}'.format(
                group, k))
    return data_class.from_dict(data_dict), slices_dict
def _get_text_dim(self, data):
    """Return the shape of the first non-empty text embedding in *data*.

    Scans every entry list for a tensor stored under
    ``text_embedding_<self.text>`` that has at least one dimension.
    Falls through (returning None) when no such embedding is found.
    """
    key = 'text_embedding_{}'.format(self.text)
    for entries in data.values():
        for entry in entries:
            embedding = torch.Tensor(entry[key])
            if len(embedding.size()):
                return embedding.shape
def filter_data(data, threshold):
    """Keep only entries whose value collection is longer than *threshold*.

    Returns a new dict; the original mapping is left untouched. Logs the
    number of surviving root causes.
    """
    filtered_data = {k: v for k, v in data.items() if len(v) > threshold}
    logger.info('root cause numbers: %d' % len(filtered_data))
    return filtered_data
def load_android_examples(android_data, test=False):
    '''yields data in the form of (id, query (title, body), example (title, body), +1/-1)'''
    # Select the split files: dev by default, test when requested.
    if test:
        neg_file = 'android_data/test.neg.txt'
        pos_file = 'android_data/test.pos.txt'
    else:
        neg_file = 'android_data/dev.neg.txt'
        pos_file = 'android_data/dev.pos.txt'
    # Group all compared ids under their query id; positives first so the
    # first element of each list is the positive example.
    grouped = {}
    with open(pos_file) as handle:
        for line in handle:
            query, other = line.split()
            grouped.setdefault(query, []).append(other)
    # NOTE(review): assumes every query in the neg file also appears in
    # the pos file — a neg-only query raises KeyError. Confirm with data.
    with open(neg_file) as handle:
        for line in handle:
            query, other = line.split()
            grouped[query].append(other)
    for query, candidates in grouped.items():
        yield (query, candidates[0], candidates[1:])
def extract_embeddings(data_path: typing.AnyStr, ) -> typing.Dict:
    """Compute (and cache) ResNet-50 logits and embeddings for every crop.

    Two-level pickle cache under *data_path*:
      1. ``crops.pkl``      — raw crops from ``extract_crop_data``.
      2. ``embeddings.pkl`` — crops augmented with ``imnet_logits`` /
         ``imnet_embed`` per frame.
    Returns the {sequence_name: {frame_idx: frame_data}} mapping.

    NOTE(review): paths are built by string concatenation, so *data_path*
    presumably ends with a separator — confirm against callers.
    """
    embeddings_backup_path = data_path + "embeddings.pkl"
    if not os.path.isfile(embeddings_backup_path):
        crops_backup_path = data_path + "crops.pkl"
        if not os.path.isfile(crops_backup_path):
            data = extract_crop_data(data_path)
            with open(crops_backup_path, "wb") as fd:
                pickle.dump(data, fd)
        else:
            with open(crops_backup_path, "rb") as fd:
                data = pickle.load(fd)
        # Pretrained ImageNet backbone, inference mode only.
        resnet50_imnet = torchvision.models.resnet50(pretrained=True).eval()
        imnet_transforms = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
        ])
        with torch.no_grad():
            for seq_name, seq_data in data.items():
                for frame_idx, frame_data in seq_data.items():
                    # OpenCV loads BGR; the normalization stats are RGB.
                    crop = cv.cvtColor(frame_data["CROP"], cv.COLOR_BGR2RGB)
                    tensor = torch.unsqueeze(imnet_transforms(crop), 0)
                    logits = torch.squeeze(resnet50_imnet(tensor), 0).numpy()
                    embed = compute_resnet_embedding_without_fc(
                        tensor, resnet50_imnet)
                    embed = torch.squeeze(embed, 0).numpy()
                    frame_data["imnet_logits"], frame_data[
                        "imnet_embed"] = logits, embed
        with open(embeddings_backup_path, "wb") as fd:
            pickle.dump(data, fd)
    else:
        with open(embeddings_backup_path, "rb") as fd:
            data = pickle.load(fd)
    return data
def _examples_to_batch(
        self, data: Dict[ColumnName, OneorMore[ArrayLike]]) -> ProcessedBatch:
    """
    Converts examples in a dataset to model inputs by using the fields
    to transform the inputs to tensors.
    Override in subclass to add custom behavior.
    """
    in_data, tgt_data = {}, {}
    train = self.dataset.train
    for name, batch in data.items():
        field = self.dataset.fields[name]
        if isinstance(field, (tuple, list)):
            # Several sub-fields share one column: accumulate their
            # transformed batches in a list under the same key, routed
            # to the target or input dict per sub-field.
            for sub_field, values in zip(field, batch):
                bucket = tgt_data if sub_field.is_target else in_data
                bucket.setdefault(name, []).append(
                    sub_field.transform_batch(values,
                                              device=self.device,
                                              train=train))
        else:
            tensor = field.transform_batch(batch,
                                           device=self.device,
                                           train=train)
            if field.is_target:
                tgt_data[name] = tensor
            else:
                in_data[name] = tensor
    return in_data, tgt_data
def get_loaders(file_path, batch_size, ratio):
    """Build one DataLoader per ratio segment from a pickled mapping.

    The pickle maps each target word to a list of source phoneme
    sequences. Pairs are flattened, shuffled, then split into
    ``len(ratio) - 1`` contiguous segments via ``get_ratio_range``.

    Returns (data_loaders, src_lang, trg_lang).
    """
    with open(file_path, "br") as r:
        data = pickle.load(r)
    src_lang = PhonemeVocab()
    trg_lang = WordVocab()
    src, trg = [], []
    for word, phoneme_seqs in data.items():
        trg_lang.add_word(word)
        src.extend(phoneme_seqs)
        # One target copy per source sequence.
        trg.extend(word for _ in phoneme_seqs)
    # shuffle data (keep src/trg aligned)
    paired = list(zip(src, trg))
    random.shuffle(paired)
    src, trg = zip(*paired)
    src, trg = list(src), list(trg)
    r_range = get_ratio_range(len(src), ratio)
    data_loaders = []
    for i in range(len(ratio) - 1):
        lo, hi = r_range[i], r_range[i + 1]
        dataset = DomainDataset(src[lo:hi], trg[lo:hi],
                                src_lang.sent2idx, trg_lang.word2idx)
        data_loaders.append(
            torch.utils.data.DataLoader(dataset=dataset,
                                        batch_size=batch_size[i],
                                        shuffle=True,
                                        drop_last=True,
                                        collate_fn=collate_fn))
    return data_loaders, src_lang, trg_lang
def get_video_names_and_annotations(data, subset):
    """Collect video path names and annotation dicts for one dataset split.

    For 'testing' only the name ('test/<key>') is recorded (no annotation
    is appended); for 'train' the name encodes the label and zero-padded
    segment bounds; for any other split the name is '<label>/<key>'.
    """
    video_names = []
    annotations = []
    for key, value in data.items():
        if value['subset'] != subset:
            continue
        if subset == 'testing':
            video_names.append('test/{}'.format(key))
            continue
        ann = value['annotations']
        label = ann['label'].replace(' ', '_')
        if subset == 'train':
            st = int(ann['segment'][0])
            end = int(ann['segment'][1])
            video_names.append('{}/{}_{}_{}'.format(
                label, key, str(st).zfill(6), str(end).zfill(6)))
        else:
            video_names.append('{}/{}'.format(label, key))
        annotations.append(ann)
    return video_names, annotations
def main():
    """Download ImageNet images for the configured subclasses.

    Maps subclass names (``args.subclass_list``) to WordNet ids via
    ``imagenet_class_info.json``, fetches each synset's URL list, and
    saves up to ``args.images_per_subclass`` images with ``get_image``.
    Relies on the module-level ``args`` namespace.
    """
    # the_url contains the required url to obtain the full list using an identifier
    prefix = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid='
    file = open('./imagenet_class_info.json')
    data = json.load(file)
    class_folder_path = os.path.join(args.data_root, args.main_class)
    if not os.path.isdir(class_folder_path):
        os.makedirs(class_folder_path)
    # Collect the WordNet ids whose class names were requested.
    wnids = []
    for k, v in data.items():
        if v['class_name'] in args.subclass_list:
            wnids.append(k)
    print(wnids)
    for wnid in wnids:
        the_list_url = prefix + wnid
        resp = requests.get(the_list_url)
        urls = [url.decode('utf-8') for url in resp.content.splitlines()]
        num_images = 0
        for url in urls:
            if num_images >= args.images_per_subclass:
                print(num_images)
                break
            # get_image returns 1 on success, 0 on failure (presumably —
            # confirm against its definition).
            num_images += get_image(url, class_folder_path)
def __iter__(self):
    """Yield each batch from the parent loader as a field-name -> value dict.

    Sequence batches are zipped against ``self.field_names``; mapping
    batches have their keys renamed through ``self.rename_dict``.
    """
    for batch in super(FixedDatasetLoader, self).__iter__():
        if isinstance(batch, (list, tuple)):
            assert len(batch) == len(self.field_names)
            yield dict(zip(self.field_names, batch))
        else:
            yield {self.rename_dict.get(key, key): value
                   for key, value in batch.items()}
def __init__(self, data_json_path, batch_size, max_length_in, max_length_out,
             num_batches=0):
    # From: espnet/src/asr/asr_utils.py: make_batchset()
    """
    Args:
        data: espnet/espnet json format file.
        num_batches: for debug. only use num_batches minibatch but not all.
    """
    super(AudioDataset, self).__init__()
    with open(data_json_path, 'rb') as f:
        data = json.load(f)['utts']
    # sort it by input lengths (long to short)
    sorted_data = sorted(data.items(), key=lambda data: int(
        data[1]['input'][0]['shape'][0]), reverse=True)
    # change batchsize depending on the input and output length
    minibatch = []
    start = 0
    while True:
        # Longest remaining utterance determines this batch's size:
        # longer inputs/outputs shrink the batch to bound memory use.
        ilen = int(sorted_data[start][1]['input'][0]['shape'][0])
        olen = int(sorted_data[start][1]['output'][0]['shape'][0])
        factor = max(int(ilen / max_length_in), int(olen / max_length_out))
        # if ilen = 1000 and max_length_in = 800
        # then b = batchsize / 2
        # and max(1, .) avoids batchsize = 0
        b = max(1, int(batch_size / (1 + factor)))
        end = min(len(sorted_data), start + b)
        minibatch.append(sorted_data[start:end])
        if end == len(sorted_data):
            break
        start = end
    if num_batches > 0:
        minibatch = minibatch[:num_batches]
    self.minibatch = minibatch
def log_write(fid, data, step = 0):
    """Write scalar metrics to whichever logging backend is available.

    Three fallbacks, tried in order:
      1. TensorFlow summaries — *fid* is then a (writer, summary, session,
         placeholder-dict) tuple; missing keys are fed as 0.
      2. ``add_scalars`` (e.g. a tensorboardX-style writer).
      3. Plain text: a tab-separated line written to a file-like object.

    :param fid: backend handle (shape depends on the backend, see above).
    :param data: {metric_name: value} mapping.
    :param step: global step recorded with the values.
    """
    try:
        import tensorflow as tf
        fid, summary, sess, var = fid
        feed_dict = {}
        for k in var.keys():
            if k in data.keys():
                feed_dict[var[k]] = data[k]
            else:
                # Placeholders not present in this batch of metrics get 0.
                feed_dict[var[k]] = 0
        summary = sess.run(summary, feed_dict = feed_dict)
        fid.add_summary(summary, step)
    except ModuleNotFoundError as e:
        # TensorFlow not installed: try a writer with add_scalars, then
        # fall back to formatting a plain text line.
        try:
            fid.add_scalars('train', data, step)
        except AttributeError as e:
            s = 'Step: %d\t' % step
            for i,(k, v) in enumerate(data.items()):
                try:
                    v = float(v)
                    s += '%s:%3.5f' % (k, float(v))
                    if i != len(data) - 1:
                        s += '\t'
                    else:
                        s += '\n'
                except:
                    # Non-numeric values are silently skipped.
                    continue
            fid.write(s)
            fid.flush()
def __getitem__(self, index):
    """Return one preprocessed sample dict for training.

    Runs label generation (``image_label``) on the raw record, applies
    the optional image transform, records the pre-transform image shape,
    and strips the keys listed in ``self.filter_keys``.
    """
    # print(self.image_list[index])
    data = self.data_list[index]
    img_path, text_polys, text_tags = self.data_list[index]['img_path'], self.data_list[index]['text_polys'], self.data_list[index]['ignore_tags']
    data = image_label(data, input_size=self.data_shape,n=self.n,m=self.m)
    # NOTE(review): `im` is loaded but never used — confirm intent.
    im = cv2.imread(img_path)
    if self.transform:
        img = self.transform(data['img'])
    # NOTE(review): if self.transform is falsy, `img` below is undefined
    # (NameError) — this path appears to assume a transform is always set.
    shape = (data['img'].shape[0], data['img'].shape[1])
    data['img'] = img
    data['shape'] = shape
    # data['score_maps'] = score_maps
    # data['training_mask'] = training_mask
    # data['text_polys'] =torch.Tensor(list(text_polys))
    # data['ignore_tags'] = [text_tags]
    # data['shape'] = shape
    # data['texts'] = [data['texts']]
    if len(self.filter_keys):
        # Drop unwanted keys before returning.
        data_dict = {}
        for k, v in data.items():
            if k not in self.filter_keys:
                data_dict[k] = v
        return data_dict
    else:
        # return {'img': img, 'score_maps': score_maps, 'training_mask': training_mask, 'shape': shape, 'text_polys': list(text_polys), 'ignore_tags': text_tags}
        # NOTE(review): returns an empty dict when no filter keys are set —
        # looks unintentional; confirm against callers.
        return {}
def load_next_buffer(self):
    """ Loads next buffer """
    # Take the next window of filenames; the index wraps around the file
    # list so buffers cycle through the whole dataset (ring-buffer style).
    self._buffer_fnames = self._files[self._buffer_index:self._buffer_index + self._buffer_size]
    self._buffer_index += self._buffer_size
    self._buffer_index = self._buffer_index % len(self._files)
    self._buffer = []
    # _cum_size[i] = number of samples in buffer files [0, i) — used to
    # map a flat sample index to (file, offset).
    self._cum_size = [0]

    # progress bar
    pbar = tqdm(total=len(self._buffer_fnames),
                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} {postfix}')
    pbar.set_description("Loading file buffer ...")

    for f in self._buffer_fnames:
        with np.load(f) as data:
            # Copy arrays out of the archive so the file handle can close.
            self._buffer += [{k: np.copy(v) for k, v in data.items()}]
            self._cum_size += [
                self._cum_size[-1] +
                self._data_per_sequence(data['rewards'].shape[0])
            ]
            pbar.set_description_str('data_len{}'.format(
                data['rewards'].shape[0]))
            pbar.update(1)
    pbar.close()
def validate_imagenet(val_loader, model, args):
    """Evaluate *model* on the real ImageNet validation set.

    :param val_loader: iterable of dict batches with 'ims' and 'labels'.
    :param model: network returning a dict with 'avg_preds'.
    :param args: namespace with ``gpu`` and ``print_freq``.
    :return: dict with top-1 / top-5 accuracy averages.

    BUGFIX: the forward pass previously called ``data['ims'].cuda()``
    unconditionally, which crashed on CPU-only machines even though the
    batch move above is guarded by a CUDA-availability check (and the
    sibling ``validate_counterfactual`` already uses the guarded form).
    """
    batch_time = AverageMeter('Time', ':6.3f')
    top1 = AverageMeter('Real Acc@1', ':6.2f')
    top5 = AverageMeter('Real Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            if args.gpu is not None or torch.cuda.is_available():
                data = {k: v.cuda(args.gpu, non_blocking=True)
                        for k, v in data.items()}

            # compute output (batch is already on the right device)
            out = model(data['ims'])

            # measure accuracy and record loss
            acc1, acc5 = accuracy(out['avg_preds'], data['labels'],
                                  topk=(1, 5))
            top1.update(acc1[0], data['labels'].size(0))
            top5.update(acc5[0], data['labels'].size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # logging
            if i % args.print_freq == 0:
                progress.display(i)

    print(f'* Real: Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}')
    return {'acc1/1_real': top1.avg, 'acc5/1_real': top5.avg}
def to_cpu(data):
    """Recursively move tensors onto the CPU.

    Dicts are mutated in place (values converted recursively) and
    returned; tensors come back as CPU tensors; anything else passes
    through unchanged.
    """
    if isinstance(data, torch.Tensor):
        return data.cpu()
    if isinstance(data, dict):
        for key in data:
            data[key] = to_cpu(data[key])
        return data
    return data
def write_element(descriptor, data, name):
    """Recursively write *data* into an HDF5-style *descriptor*.

    Tensors become datasets called *name*; dicts become groups whose
    entries are written recursively under their keys. Any other type
    fails the assertion (the name is included for easier debugging).
    """
    if isinstance(data, torch.Tensor):
        descriptor.create_dataset(name, data=data)
        return
    assert isinstance(data, dict), name
    group = descriptor.create_group(name)
    for key, value in data.items():
        write_element(group, value, key)
def to_device(data):
    """Move a dict's values or a sequence's items onto the module-level
    ``device``.

    Mutates *data* in place and returns it. Non-dict inputs must support
    index assignment (lists work; tuples do not).
    """
    if type(data) == dict:
        for key, value in data.items():
            data[key] = value.to(device=device)
    else:
        for position, element in enumerate(data):
            data[position] = element.to(device=device)
    return data
def fromdict(cls, data, fields):
    """Build an example instance from a raw mapping.

    Only keys present in *fields* are kept. Each fields entry is a
    (target_attribute_name, field) pair; a ``None`` field is skipped,
    otherwise the raw value is run through ``field.preprocess`` and
    stored on the new instance.
    """
    example = cls()
    for key, raw_value in data.items():
        if key not in fields:
            continue
        attr_name, field = fields[key]
        if field is not None:
            setattr(example, attr_name, field.preprocess(raw_value))
    return example
def __init__(self, data, uk_data, set_type, config):
    """Speech-command dataset split with noise/unknown/silence mixing.

    Args:
        data: {audio_file_path: label}; label 1 appears to mark the
            "unknown" class (split out below) — confirm with callers.
        uk_data: {path: label} pool of unknown-word samples.
        set_type: split name (used for logging here).
        config: dict of audio/augmentation hyper-parameters.
    """
    super().__init__()
    self.uk_data = uk_data
    print(set_type, set(list(self.uk_data.values())))
    c = Counter(list(self.uk_data.values()))
    print(c)
    self.audio_files = list(data.keys())
    self.set_type = set_type
    self.audio_labels = list(data.values())
    print(np.bincount(np.array(self.audio_labels)))
    # Keep only .wav background-noise files and pre-load them at 16 kHz.
    config["bg_noise_files"] = list(
        filter(lambda x: x.endswith("wav"), config.get("bg_noise_files",
                                                       [])))
    self.bg_noise_audio = [
        librosa.core.load(file, sr=16000)[0]
        for file in config["bg_noise_files"]
    ]
    self.unknown_prob = config["unknown_prob"]
    self.silence_prob = config["silence_prob"]
    self.noise_prob = config["noise_prob"]
    self.n_dct = config["n_dct_filters"]
    self.input_length = config["input_length"]
    self.timeshift_ms = config["timeshift_ms"]
    self.filters = librosa.filters.dct(config["n_dct_filters"],
                                       config["n_mels"])
    self.n_mels = config["n_mels"]
    self._audio_cache = SimpleCache(config["cache_size"])
    self._file_cache = SimpleCache(config["cache_size"])
    # NOTE(review): n_unk is computed but never used.
    n_unk = len(list(filter(lambda x: x == 1, self.audio_labels)))
    print(set_type, self.unknown_prob)
    # Split entries with label 1 ("unknown" words) away from the rest.
    self.uk_audio_files = [k for (k, v) in data.items() if v == 1]
    self.n_unknown = int(
        self.unknown_prob *
        (len(self.audio_labels) - len(self.uk_audio_files)))
    #shuffle(self.uk_audio_files)
    self.uk_index = 0
    self.n_silence = int(
        self.silence_prob *
        (len(self.audio_labels) - len(self.uk_audio_files)))
    # Overwrite the earlier full lists: from here on audio_files /
    # audio_labels hold only the known-word samples.
    self.audio_files = [k for (k, v) in data.items() if v != 1]
    self.audio_labels = [v for (k, v) in data.items() if v != 1]
def get_rc_data(data, rc_lists):
    """Select entries whose key appears in *rc_lists*.

    Returns a new dict; logs the total number of values kept across all
    selected entries.
    """
    new_data = {k: v for k, v in data.items() if k in rc_lists}
    count = sum(len(v) for v in new_data.values())
    logger.info('Remained data numbers: %d' % count)
    return new_data
def __init__(self, dict_path="data/data.pkl", val_size=50):
    """Load the pickled data dict and keep a shuffled copy of its items.

    ``val_size`` is accepted for interface compatibility but not used in
    this constructor.
    """
    with open(dict_path, 'rb') as f:
        data = pickle.load(f)
    shuffled_items = list(data.items())
    random.shuffle(shuffled_items)
    self.data_dict = dict(shuffled_items)
    self.maxlen = 1500
def read_object_labels(root, dataset, set):
    """Assemble per-image multi-label vectors for a VOC split.

    Reads one '<category>_<set>.txt' file per category from
    VOCdevkit/<dataset>/ImageSets/Main and stacks the per-category labels
    into a vector of length len(object_categories) per image name.
    (``set`` shadows the builtin, but renaming would break callers that
    pass it by keyword.)
    """
    path_labels = os.path.join(root, 'VOCdevkit', dataset, 'ImageSets',
                               'Main')
    labeled_data = dict()
    num_classes = len(object_categories)
    for idx, category in enumerate(object_categories):
        file_path = os.path.join(path_labels, category + '_' + set + '.txt')
        per_image = read_image_label(file_path)
        if idx == 0:
            # First category defines the image set: allocate a zeroed
            # label vector per image and fill this category's slot.
            for (name, label) in per_image.items():
                vec = np.zeros(num_classes)
                vec[idx] = label
                labeled_data[name] = vec
        else:
            for (name, label) in per_image.items():
                labeled_data[name][idx] = label
    return labeled_data
def get_data(mode):
    """Load the lab5 dataset split named *mode*.

    For 'train', returns a squeezed numpy array built from the JSON
    object's (key, value) items; for any other split, returns the parsed
    JSON object unchanged.

    BUGFIX: the original called ``json.load(open(...))`` and never closed
    the file handle; a context manager guarantees it is released.
    """
    with open('./lab5_dataset/' + mode + '.json', 'r') as f:
        data = json.load(f)
    if mode == 'train':
        Data = [item for item in data.items()]
        return np.squeeze(Data)
    return data
def __permute_data__(self, data, node_idx, adj):
    """Return a shallow copy of *data* with its node attributes permuted.

    Every attribute that *data* reports as node-level is re-indexed by
    *node_idx*; the sparse edge_index is cleared and the dense *adj*
    is attached instead.
    """
    permuted = copy.copy(data)
    for key, value in data.items():
        if data.is_node_attr(key):
            permuted[key] = value[node_idx]
    permuted.edge_index = None
    permuted.adj = adj
    return permuted
def collate_fn(batch):
    r"""Puts each data field into a tensor with outer dimension batch size"""
    # Each sample packs several examples per key (as many as there are
    # images); unpack them into one flat dict per example, then let the
    # default collater stack everything.
    flattened = [
        {key: values[i] for key, values in sample.items()}
        for sample in batch
        for i in range(len(sample['image']))
    ]
    return default_collate(flattened)
def evaluate(self, data: dict, model: Model, path_to_model: str = "") -> dict:
    """
    Evaluates the saved best model against train, val and test data
    :param data: a dictionary tuple containing the data loaders for train, val and test
    :param model: the model to be evaluated
    :param path_to_model: the path to the saved serialization of the best model
    :return: the eval of the model on train, val and test data, including metrics, gt and preds
    """
    model.evaluation_mode()
    if path_to_model != "":
        model.load(path_to_model)

    metrics, gt, preds = {}, {}, {}
    for set_type, dataloader in data.items():
        loss, accuracy, y_scores, y_true = [], [], [], []
        with torch.no_grad():
            for i, (x, y) in enumerate(dataloader):
                x, y = x.float().to(self.__device), y.long().to(
                    self.__device)
                o = model.predict(x).to(self.__device)
                loss += [model.get_loss(o, y)]
                accuracy += [self.batch_accuracy(o, y)]
                y_scores += torch.exp(o).cpu().numpy().tolist()
                y_true += y.cpu().numpy().tolist()
        y_scores, y_true = np.array(y_scores).reshape(
            (len(y_scores), 2)), np.array(y_true)
        # Threshold the positive-class score at the ROC-optimal cutoff.
        # BUGFIX: `dtype=np.int` was removed in NumPy 1.24 and raised
        # AttributeError — use the builtin `int` instead.
        y_pred = np.array((y_scores[:, 1] >= self.__optimal_roc_threshold(
            y_true, y_scores[:, 1])), dtype=int)
        set_metrics = self.__compute_metrics(y_true, y_pred,
                                             y_1_scores=y_scores[:, 1])
        set_metrics["accuracy"], set_metrics["loss"] = np.mean(
            accuracy), np.mean(loss)
        print("\n {} metrics: \n".format(set_type.upper()))
        for metric, value in set_metrics.items():
            print(("\t - {} " + "".join(["."] * (15 - len(metric))) +
                   " : {:.4f}").format(metric, value))
        metrics[set_type], gt[set_type], preds[
            set_type] = set_metrics, y_true, y_pred
    return {"metrics": metrics, "gt": gt, "preds": preds}
def validate_counterfactual(val_loader, model, args):
    """Evaluate *model* on counterfactual data with shape/texture/bg labels.

    Tracks top-1/top-5 accuracy separately for the three label types and
    returns them in a flat dict keyed for logging.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    top1_shape = AverageMeter('Shape Acc@1', ':6.2f')
    top5_shape = AverageMeter('Shape Acc@5', ':6.2f')
    top1_texture = AverageMeter('Texture Acc@1', ':6.2f')
    top5_texture = AverageMeter('Texture Acc@5', ':6.2f')
    top1_bg = AverageMeter('Bg Acc@1', ':6.2f')
    top5_bg = AverageMeter('Bg Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader),
                             [batch_time, top1_shape, top5_shape,
                              top1_texture, top5_texture, top1_bg, top5_bg],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            # Move the whole batch to GPU when one is requested/available.
            if args.gpu is not None or torch.cuda.is_available():
                data = {k: v.cuda(args.gpu, non_blocking=True)
                        for k, v in data.items()}

            # compute output
            out = model(data['ims'])

            # measure accuracy and record loss
            sz = len(data['ims'])
            acc1, acc5 = accuracy(out['shape_preds'], data['shape_labels'],
                                  topk=(1, 5))
            top1_shape.update(acc1[0], sz)
            top5_shape.update(acc5[0], sz)
            acc1, acc5 = accuracy(out['texture_preds'],
                                  data['texture_labels'], topk=(1, 5))
            top1_texture.update(acc1[0], sz)
            top5_texture.update(acc5[0], sz)
            acc1, acc5 = accuracy(out['bg_preds'], data['bg_labels'],
                                  topk=(1, 5))
            top1_bg.update(acc1[0], sz)
            top5_bg.update(acc5[0], sz)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # logging
            if i % args.print_freq == 0:
                progress.display(i)

    print(f'* Shape: Acc@1 {top1_shape.avg:.3f} Acc@5 {top5_shape.avg:.3f}')
    print(f'* Texture: Acc@1 {top1_texture.avg:.3f} Acc@5 {top5_texture.avg:.3f}')
    print(f'* BG: Acc@1 {top1_bg.avg:.3f} Acc@5 {top5_bg.avg:.3f}')
    return {'acc1/2_shape': top1_shape.avg,
            'acc1/3_texture': top1_texture.avg,
            'acc1/4_bg': top1_bg.avg,
            'acc5/2_shape': top5_shape.avg,
            'acc5/3_texture': top5_texture.avg,
            'acc5/4_bg': top5_bg.avg}
def _to_device(data, device):
    """Recursively move tensors (possibly nested in containers) to *device*.

    Tuples and lists both come back as lists; dicts keep their keys;
    numpy arrays are converted to tensors on the device; anything else
    passes through unchanged.
    """
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, dict):
        return {key: _to_device(value, device)
                for key, value in data.items()}
    if isinstance(data, (tuple, list)):
        return [_to_device(item, device) for item in data]
    if isinstance(data, np.ndarray):
        return torch.tensor(data).to(device)
    return data
def _apply_to_data(data, func, unpack_dict=False):
    """Apply *func* to *data*, recursing into dicts and sequences.

    With ``unpack_dict`` a dict yields a list of transformed values
    instead of a dict. Sequences whose elements cannot be iterated fall
    back to applying *func* to the sequence as a whole.
    """
    recurse = partial(_apply_to_data, func=func, unpack_dict=unpack_dict)
    if isinstance(data, dict):
        if unpack_dict:
            return [recurse(value) for value in data.values()]
        return {key: recurse(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        # e.g. list/tuple of arrays
        try:
            return [recurse(element) for element in data]
        except TypeError:
            return func(data)
    return func(data)
def load_next_buffer(self):
    """Load the next window of data files into memory.

    Advances the buffer index through the file list (wrapping around at
    the end) and rebuilds ``_buffer`` plus the ``_cum_size`` prefix sums
    used to map a flat sample index onto a (file, offset) pair.
    """
    start = self._buffer_index
    self._buffer_fnames = self._files[start:start + self._buffer_size]
    self._buffer_index = (start + self._buffer_size) % len(self._files)
    self._buffer = []
    self._cum_size = [0]

    # progress bar
    pbar = tqdm(total=len(self._buffer_fnames),
                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} {postfix}')
    pbar.set_description("Loading file buffer ...")

    for fname in self._buffer_fnames:
        with np.load(fname) as archive:
            # Copy arrays out of the archive so its handle can close.
            self._buffer.append(
                {key: np.copy(value) for key, value in archive.items()})
            self._cum_size.append(
                self._cum_size[-1] +
                self._data_per_sequence(archive['rewards'].shape[0]))
        pbar.update(1)
    pbar.close()