def _build_data_loaders(self):
    transforms = self._get_transforms()

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=self.data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    test_dataset = Dataset(
        data_dir=self.test_data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )

    # DEFINE: DATA LOADER
    self.train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader_ = iter(self.test_data_loader)
def __init__(self):
    dataset = Dataset('config_compas.json')
    x, y = dataset.get_data(readable=True)
    # r = "Af_vs_all"
    r = "Af_vs_Caucasian"
    # r = "all"
    x, y = get_dataframe(x, y, requested=r)
    self.finder = RelationshipsFinder(pd.concat([x, y], axis=1))
def __init__(self, args, root, split, labelpath, cachedir,
             transform=None, target_transform=None, test_gap=50):
    Dataset.__init__(self, test_gap, split)
    self.num_classes = 174
    self.transform = transform
    self.target_transform = target_transform
    self.cls2int = self.parse_something_labels(args.label_file)
    self.labels = self.parse_something_json(labelpath, self.cls2int)
    self.root = root
    cachename = '{}/{}_{}.pkl'.format(cachedir, self.__class__.__name__, split)
    self._data = cache(cachename)(self._prepare)(root, self.labels, split)
def __init__(self, positive=True, limit_sentences=None,
             dataset_cache_dir=None, dataset_name=None):
    Dataset.__init__(self,
                     limit_sentences=limit_sentences,
                     dataset_cache_dir=dataset_cache_dir,
                     dataset_name=dataset_name)
    self.positive = positive
def train(args):
    train_transforms = transforms.Compose([
        transforms.Resize(args.image_shape),
        transforms.RandomHorizontalFlip(),
        transforms.Normalize()
    ])
    eval_transforms = transforms.Compose(
        [transforms.Resize(args.image_shape),
         transforms.Normalize()])

    train_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.train_list,
        transforms=train_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=True)

    eval_dataset = None
    if args.val_list is not None:
        eval_dataset = Dataset(
            data_dir=args.data_dir,
            file_list=args.val_list,
            transforms=eval_transforms,
            num_workers='auto',
            buffer_size=100,
            parallel_method='thread',
            shuffle=False)

    if args.model_type == 'HumanSegMobile':
        model = HumanSegMobile(num_classes=2)
    elif args.model_type == 'HumanSegLite':
        model = HumanSegLite(num_classes=2)
    elif args.model_type == 'HumanSegServer':
        model = HumanSegServer(num_classes=2)
    else:
        raise ValueError(
            "--model_type: {} is set wrong, it should be one of ('HumanSegMobile', "
            "'HumanSegLite', 'HumanSegServer')".format(args.model_type))

    model.train(
        num_epochs=args.num_epochs,
        train_dataset=train_dataset,
        train_batch_size=args.batch_size,
        eval_dataset=eval_dataset,
        save_interval_epochs=args.save_interval_epochs,
        save_dir=args.save_dir,
        pretrained_weights=args.pretrained_weights,
        resume_weights=args.resume_weights,
        learning_rate=args.learning_rate,
        use_vdl=args.use_vdl)
def __init__(self, txt_fn="data/train_phototourism_ms.txt", style2style=False):
    Dataset.__init__(self)
    self.txt_fn = txt_fn
    self.style2style = style2style
    self.path_orig_img = "/ssd/data/phototourism/orig"
    self.path_style_img = "/data/datasets/phototourism/style_transfer_all"
    self.style2fnames = get_style2fnames()  # a dirty hack
    self.crop_size = 192
    self.imgs = [line.rstrip("\n") for line in open(self.txt_fn)]
    self.nimg = len(self.imgs)
    self.npairs = len(self.imgs)
def _build_data_loaders(self):
    transforms = self._get_transforms()
    extraimg_transform = torchvision.transforms.Compose(
        transforms["frame"].transforms[1:])

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=self.data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    test_dataset = Dataset(
        data_dir=self.test_data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    train_extraimg_dataset = IMGDataset(
        data_dir=self.extraimg_data_dir,
        data_type=self.extraimg_type,
        transform=extraimg_transform,
    )

    # DEFINE: DATA LOADER
    self.train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.train_extraimg_data_loader = DataLoader(
        dataset=train_extraimg_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader_ = iter(self.test_data_loader)
def main(config_file):
    """
    :param config_file:
    :return:
    """
    tf.reset_default_graph()

    with open(config_file) as config_file:
        config = json.load(config_file)

    dset = Dataset(config['dset_name'], config['dset_config'])
    model_file = get_model_file(config)

    with tf.device(config['device']):
        model = construct_model(config['dset_name'])
        attack = construct_attack(model, config, dset)
    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # Restore the checkpoint
        saver.restore(sess, model_file)

        # Iterate over the samples batch-by-batch
        num_eval_examples = config['num_eval_examples']
        eval_batch_size = config['eval_batch_size']
        num_batches = int(math.ceil(num_eval_examples / eval_batch_size))

        x_adv = []  # adv accumulator

        print('Iterating over {} batches'.format(num_batches))
        for ibatch in range(num_batches):
            bstart = ibatch * eval_batch_size
            bend = min(bstart + eval_batch_size, num_eval_examples)
            print('batch size: {}'.format(bend - bstart))

            x_batch, y_batch = dset.get_eval_data(bstart, bend)
            x_batch_adv = attack.perturb(x_batch, y_batch, sess)
            x_adv.append(x_batch_adv)

        print('Storing examples')
        path = data_path_join(config['store_adv_path'])
        x_adv = np.concatenate(x_adv, axis=0)
        np.save(path, x_adv)
        print('Examples stored in {}'.format(path))
def plot_interactive(f_2d, indexs, dataset_obj: Dataset, labels):
    n = f_2d.shape[0]
    d_point_index = {i: indexs[i] for i in range(f_2d.shape[0])}
    img = imshow_util_uint8(
        dataset_obj.get_train_image_at(indexs[0])[0][0], dataset_obj)

    # create figure and plot scatter
    fig, axis = plt.subplots(1, 2)
    line = axis[0].scatter(f_2d[:, 0], f_2d[:, 1], c=labels.reshape(n), s=5)
    temp = axis[1].imshow(img)

    def hover(event):
        # if the mouse is over the scatter points
        if line.contains(event)[0]:
            ind = line.contains(event)[1]["ind"][0]
            r_ind = d_point_index[ind]
            print("Getting {0} index".format(r_ind))
            img = imshow_util_uint8(
                dataset_obj.get_train_image_at(r_ind)[0][0], dataset_obj)
            temp.set_data(img)
            fig.canvas.draw_idle()

    # add callback for mouse moves
    fig.canvas.mpl_connect('motion_notify_event', hover)
    plt.show()
def gen_s_curve(rng, emissions):
    """Generate synthetic data from the S-curve generating process.
    """
    N = 500
    J = 100
    D = 2

    # Generate latent manifold.
    # -------------------------
    X, t = make_s_curve(N, random_state=rng)
    X = np.delete(X, obj=1, axis=1)
    X = X / np.std(X, axis=0)
    inds = t.argsort()
    X = X[inds]
    t = t[inds]

    # Generate kernel `K` and latent GP-distributed maps `F`.
    # -------------------------------------------------------
    K = kern.RBF(input_dim=D, lengthscale=1).K(X)
    F = rng.multivariate_normal(np.zeros(N), K, size=J).T

    # Generate emissions using `F` and/or `K`.
    # ----------------------------------------
    if emissions == 'bernoulli':
        P = logistic(F)
        Y = rng.binomial(1, P).astype(np.double)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    if emissions == 'gaussian':
        Y = F + np.random.normal(0, scale=0.5, size=F.shape)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'multinomial':
        C = 100
        pi = np.exp(F - logsumexp(F, axis=1)[:, None])
        Y = np.zeros(pi.shape)
        for n in range(N):
            Y[n] = rng.multinomial(C, pi[n])
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'negbinom':
        P = logistic(F)
        R = np.arange(1, J + 1, dtype=float)
        Y = rng.negative_binomial(R, 1 - P)
        return Dataset('s-curve', False, Y, X, F, K, R, t)
    else:
        assert (emissions == 'poisson')
        theta = np.exp(F)
        Y = rng.poisson(theta)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
def randomKway(name, number, marginal, seed=0):
    if name.endswith('-small'):
        dataset_name = name[:-6]
    else:
        dataset_name = name
    path = f"datasets/{dataset_name}.csv"
    domain = f"datasets/{name}-domain.json"
    data = Dataset.load(path, domain)
    return data, randomKwayData(data, number, marginal, seed)
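# Usage sketch for randomKway (the dataset name 'adult' is hypothetical; it
# assumes datasets/adult.csv and datasets/adult-domain.json exist, matching
# the paths built above):
# data, kway_queries = randomKway('adult', number=64, marginal=3, seed=0)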
def train(self, config, word2vec, tokenizer):
    from datasets.dataset import Dataset, DatasetParam

    dataset_args = DatasetParam()
    dataset_args.output_dir = config['data_params']['output_dir']
    dataset_args.embed_dim = config['data_params']['embed_dim']
    dataset_args.max_sentence_len = config['data_params']['max_sentence_len']
    dataset_args.min_word_freq = config['data_params']['min_word_freq']
    dataset_args.max_vocab_size = config['data_params']['max_vocab_size']
    dataset_args.test_rate = config['data_params']['test_rate']
    dataset_args.tokenizer = tokenizer
    dataset_args.data_dir = config['data_params']['data_dir']
    dataset_args.cate_list = config['model_params']['cate_list']
    dataset_args.word2vec_iterator = word2vec
    dataset_args.data_vocab_dir = config['data_params']['data_vocab_dir']
    dataset_args.data_vocab_tag = str(config['data_params']['data_vocab_tag'])
    dataset_args.data_file = config['data_params']['data_file']

    dataset = Dataset(dataset_args)
    train_set, test_set = dataset.buildWithAllData(False)

    x_train, y_train = zip(*train_set)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test, y_test = zip(*test_set)
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    # Load the Bernoulli naive Bayes classifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.externals import joblib
    classifier = BernoulliNB()

    # Train the model and save it
    classifier.fit(x_train, y_train)
    joblib.dump(classifier,
                os.path.join(dataset_args.output_dir, 'bayes_model.m'))

    # Evaluate on the test split and compute accuracy
    y_ = classifier.predict(x_test)
    acc = np.mean(
        [1 if y_[i] == y_test[i] else 0 for i in range(y_test.shape[0])],
        axis=0)
    print("eval acc: %f" % acc)
def load_bridges():
    """Load NYC bridges dataset: https://data.cityofnewyork.us/Transportation/
    Bicycle-Counts-for-East-River-Bridges/gua4-p9wg
    """
    data = np.load('datasets/bridges.npy', allow_pickle=True)
    data = data[()]
    Y = data['Y']
    labels = data['labels']
    return Dataset('bridges', True, Y, labels=labels)
def __init__(self, args, root, split, labelpath, cachedir,
             transform=None, target_transform=None, input_size=224,
             test_gap=10):
    Dataset.__init__(self, test_gap, split)
    self.num_classes = 80
    self.transform = transform
    self.target_transform = target_transform
    self.cls2int = dict((str(x + 1), x) for x in range(80))
    self.labels = self.parse_ava_csv(labelpath, self.cls2int)
    self.root = root
    self.train_gap = 64
    self.input_size = input_size
    cachename = '{}/{}_{}.pkl'.format(cachedir, self.__class__.__name__, split)
    self._data = cache(cachename)(self._prepare)(root, self.labels, split)
def main():
    config = Config()
    create_dirs([config.summary_dir, config.checkpoint_dir])
    sess = tf.Session()
    train_data = Dataset(config.root,
                         config.train_image_file,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=None)
    valid_data = Dataset(config.root,
                         config.valid_image_file,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=None)
    train_data_loader = DataLoader(train_data)
    valid_data_loader = DataLoader(valid_data)
    model = DenseNet(config)
    logger = Logger(sess, config)
    trainer = DenseNetTrainer(sess, model, train_data_loader,
                              valid_data_loader, config, logger)
    model.load(sess)
    if config.phase == "train":
        trainer.train()
    elif config.phase == "test":
        trainer.test("prediction.csv")
def __init__(self, args, root, split, label_path, cachedir,
             transform=None, target_transform=None, input_size=224,
             test_gap=25, train_gap=4):
    Dataset.__init__(self, test_gap, split)
    self.num_classes = 400
    self.transform = transform
    self.target_transform = target_transform
    self.cls2int = self.parse_kinetics_labels(args.train_file)
    self.labels = self.parse_kinetics_csv(label_path, self.cls2int)
    self.root = root
    self.train_gap = train_gap
    self.input_size = input_size
    cachename = '{}/{}_{}.pkl'.format(cachedir, self.__class__.__name__, split)
    self._data = cache(cachename)(self._prepare)(root, self.labels, split)
def evaluate(args):
    eval_transforms = transforms.Compose(
        [transforms.Resize(args.image_shape),
         transforms.Normalize()])

    eval_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.val_list,
        transforms=eval_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=False)

    model = models.load_model(args.model_dir)
    model.evaluate(eval_dataset, args.batch_size)
def load_congress():
    """Congress 109 data:
    https://github.com/jgscott/STA380/blob/master/data/congress109.csv
    https://github.com/jgscott/STA380/blob/master/data/congress109members.csv
    """
    df1 = pd.read_csv('datasets/congress109.csv')
    df2 = pd.read_csv('datasets/congress109members.csv')
    assert (len(df1) == len(df2))
    # Ensure same ordering.
    df1 = df1.sort_values(by='name')
    df2 = df2.sort_values(by='name')
    Y = df1.values[:, 1:].astype(int)
    labels = np.array([0 if x == 'R' else 1 for x in df2.party.values])
    return Dataset('congress109', True, Y, labels=labels)
def test(**kwargs):
    opt.parse(kwargs)

    if opt.device is not None:
        opt.device = torch.device(opt.device)
    elif opt.gpus:
        opt.device = torch.device(0)
    else:
        opt.device = torch.device('cpu')

    pretrain_model = load_pretrain_model(opt.pretrain_model_path)

    generator = GEN(opt.dropout, opt.image_dim, opt.text_dim, opt.hidden_dim,
                    opt.bit, pretrain_model=pretrain_model).to(opt.device)

    path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit)
    load_model(generator, path)

    generator.eval()

    images, tags, labels = load_data(opt.data_path, opt.dataset)

    i_query_data = Dataset(opt, images, tags, labels, test='image.query')
    i_db_data = Dataset(opt, images, tags, labels, test='image.db')
    t_query_data = Dataset(opt, images, tags, labels, test='text.query')
    t_db_data = Dataset(opt, images, tags, labels, test='text.db')

    i_query_dataloader = DataLoader(i_query_data, opt.batch_size, shuffle=False)
    i_db_dataloader = DataLoader(i_db_data, opt.batch_size, shuffle=False)
    t_query_dataloader = DataLoader(t_query_data, opt.batch_size, shuffle=False)
    t_db_dataloader = DataLoader(t_db_data, opt.batch_size, shuffle=False)

    qBX = generate_img_code(generator, i_query_dataloader, opt.query_size)
    qBY = generate_txt_code(generator, t_query_dataloader, opt.query_size)
    rBX = generate_img_code(generator, i_db_dataloader, opt.db_size)
    rBY = generate_txt_code(generator, t_db_dataloader, opt.db_size)

    query_labels, db_labels = i_query_data.get_labels()
    query_labels = query_labels.to(opt.device)
    db_labels = db_labels.to(opt.device)

    mapi2t = calc_map_k(qBX, rBY, query_labels, db_labels)
    mapt2i = calc_map_k(qBY, rBX, query_labels, db_labels)
    print('...test MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i))
def load_test_dataset(data_dir, syntax, max_example_actions_num):
    # all with unary closures
    terminal_vocab_file = os.path.join(data_dir, 'terminal_vocab.txt')
    grammar_file = os.path.join(data_dir, 'grammar.txt.uc.bin')

    grammar = deserialize_from_file(grammar_file)
    terminal_vocab = Vocab(
        terminal_vocab_file,
        data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])
    vocab = Vocab(
        os.path.join(data_dir, 'vocab.txt'),
        data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])

    prefix = 'uc_' + syntax + '_'
    test_dir = os.path.join(data_dir, 'test')
    test = Dataset(test_dir, 'test', grammar, vocab, terminal_vocab,
                   syntax, max_example_actions_num, True)
    # NOTE: `test_file` was undefined in the original snippet; the output path
    # below is an assumption built from `prefix`, which was otherwise unused.
    test_file = os.path.join(data_dir, prefix + 'test.pt')
    torch.save(test, test_file)
def evaluate(args):
    eval_transforms = transforms.Compose(
        [transforms.Resize((192, 192)),
         transforms.Normalize()])

    eval_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.quant_list,
        transforms=eval_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=False)

    model = models.load_model(args.model_dir)
    model.export_quant_model(
        dataset=eval_dataset,
        save_dir=args.save_dir,
        batch_size=args.batch_size,
        batch_nums=args.batch_nums)
def generate(self, dataset_object: Dataset, index_list, mask_file, select_path):
    d_name = str(dataset_object.__class__.__name__)
    gen_name = str(self.__class__.__name__)
    f_name = '{0}_{1}_{2}'.format(d_name, gen_name, now_string())
    out_folder = os.path.join('gen_images', f_name)
    os.makedirs(out_folder, exist_ok=True)

    gen_dict = {}

    # open masks pickle
    with open(mask_file, 'rb') as f:
        mask_dict = pickle.load(f)

    for ind in index_list:
        mask = mask_dict['masks'][ind]
        # returns (img, label), img = [batch, w, h, c]
        img = dataset_object.get_train_image_at(ind)[0][0]
        result = self.generate_img_mask(img, mask)

        cv2.imwrite(os.path.join(out_folder, '{0}__mask.png'.format(ind)),
                    mask.astype(np.uint8) * 255)

        gen_dict[ind] = []
        for ind_out, elem in enumerate(result):
            out_path_img = os.path.join(out_folder,
                                        '{0}__{1}.png'.format(ind, ind_out))
            cv2.imwrite(out_path_img, elem)
            gen_dict[ind].append(out_path_img)

    exp_json = {
        'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'dataset': d_name,
        'used_select': select_path,
        'index_map': gen_dict,
        'mask_file': str(mask_file),
        'generator': gen_name
    }
    print("Results in " + str(os.path.join(out_folder, 'exp_details.json')))
    with open(os.path.join(out_folder, 'exp_details.json'), 'w') as f:
        json.dump(exp_json, f)
def get_dummy_data(domain, data_size, query_manager=None):
    dis = {}
    for attr, n in zip(domain.attrs, domain.shape):
        random_dist = np.random.exponential(10, n)
        random_dist = random_dist / np.sum(random_dist)
        dis[attr] = random_dist

    arr = [
        np.random.choice(n, data_size, p=dis[attr])
        for attr, n in zip(domain.attrs, domain.shape)
    ]
    values = np.array(arr).T
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if query_manager is not None:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plt.hist(ans)
        plt.show()
    return data
def _get_transforms(self):
    # for normalizer of mel
    mel_data_loader = None
    if not os.path.isfile(self.data_config["mel_normalizer_savefile"]):
        mel_dataset = Dataset(
            data_dir=self.data_dir,
            transforms={},
            load_files=["log_mel_spec", "mel_if"],
        )
        mel_data_loader = DataLoader(
            dataset=mel_dataset,
            batch_size=self.model_config["batch_size"],
            shuffle=False,
            pin_memory=True,
            num_workers=self.model_config["batch_size"] // 2,
        )

    # Data definitions
    frame_transforms = [
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize((256, 256)),
        torchvision.transforms.RandomHorizontalFlip(p=0.5),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]

    if self.model_config["flip"] is not True:
        # RandomHorizontalFlip sits at index 2 of the list above; the original
        # popped index 1, which removes Resize and trips the assert below.
        flip_transform = frame_transforms.pop(2)
        assert isinstance(flip_transform,
                          torchvision.transforms.RandomHorizontalFlip)

    transforms = {
        "frame": torchvision.transforms.Compose(frame_transforms),
        "mel": MelNormalizer(
            dataloader=mel_data_loader,
            savefile_path=self.data_config["mel_normalizer_savefile"],
        ),
    }

    return transforms
def main(config_file):
    """
    :param config_file:
    :return:
    """
    # deallocate memory if any
    tf.reset_default_graph()
    # free_gpus()

    # load configs.
    with open(config_file) as config_file:
        config = json.load(config_file)

    # load dataset
    dset = Dataset(config['dset_name'], config['dset_config'])

    with tf.device(config['device']):
        model = construct_model(config['dset_name'])

    x_adv = np.load(data_path_join(config['store_adv_path']))
    model_file = get_model_file(config)

    num_eval_examples = config['num_eval_examples']
    eval_batch_size = config['eval_batch_size']
    target_shape = (num_eval_examples,) + get_dataset_shape(config['dset_name'])

    check_values(x_adv, dset.min_value, dset.max_value)
    check_shape(x_adv, target_shape)

    res = get_res(model_file,
                  x_adv,
                  config['attack_config']['epsilon'],
                  model,
                  dset,
                  num_eval_examples=num_eval_examples,
                  eval_batch_size=eval_batch_size)
    return res
def _build_data_loader(data_dir, batch_size=256, sr=16000):
    transforms = {}
    if sr != 32000:
        transforms["audio"] = lambda audio: librosa.resample(audio, sr, 32000)
        print(f"[!] sr: {sr} -> 32000")

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=data_dir,
        transforms=transforms,
        load_files=["audio"],
    )

    # DEFINE: DATA LOADER
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=batch_size // 2,
    )

    return train_data_loader
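# Usage sketch for _build_data_loader (the directory is hypothetical; when
# sr != 32000 the "audio" transform resamples each clip to 32 kHz via librosa):
# loader = _build_data_loader("/path/to/audio_dir", batch_size=64, sr=16000)
# for batch in loader:
#     ...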
def get_loader(csv_dir, split, resample, slices_per_example, batch_size,
               num_workers, toy, input_scan, output_scan):
    """Initialize the data loader"""
    csv_dir = Path(csv_dir)
    # Default csv path is csv_dir / train.csv or whatever split is
    csv_path = str(csv_dir / f'{split}.csv')
    dataset = Dataset(csv_path=csv_path,
                      split=split,
                      toy=toy,
                      input_scan=input_scan,
                      output_scan=output_scan,
                      resample=resample,
                      num_slices=slices_per_example)
    loader = data.DataLoader(dataset,
                             batch_size=batch_size,
                             drop_last=False,
                             pin_memory=True,
                             num_workers=num_workers)
    return loader
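# Usage sketch for get_loader (all argument values are hypothetical; csv_dir
# is expected to contain '<split>.csv' per the path construction above):
# val_loader = get_loader(csv_dir='data', split='valid', resample=False,
#                         slices_per_example=8, batch_size=4, num_workers=2,
#                         toy=False, input_scan='t1', output_scan='t2')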
def instantiate_net(args, Train=True):
    module = importlib.import_module('Models.Class' + args['3_model_class'])
    class_ = getattr(module, args['3_model_class'])
    my_net = class_()
    with_GT = True
    if args['3_dimension'] == '':
        rescale_factor = 'Default'
    else:
        rescale_factor = args['3_dimension']
    my_dataset = Dataset()
    if Train:
        my_dataset.load_train(args['3_ds'], rescale_factor)
    my_dataset.load_test(args['3_ds'], rescale_factor)
    my_net.create_model(args, my_dataset)
    return my_dataset, my_net
def get_dummy_data2(domain, data_size, query_manager, display=False):
    num_attr = len(domain.attrs)
    bag = {}
    for i in range(len(query_manager.workloads)):
        if len(bag) >= num_attr // 2:
            break
        for attr in query_manager.workloads[i]:
            id = query_manager.att_id[attr]
            if id not in bag:
                attr_size = domain.shape[id]
                bag[id] = np.random.randint(0, attr_size)

    arr = []
    for _ in range(data_size):
        arr.append(get_dummy_row(domain, bag))
    values = np.array(arr)
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if display:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plot_bins(ans, title='Dummy')
    return data
def evaluate():
    config = Config()
    valid_data = Dataset(config.root,
                         valid_image_paths,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=10)
    valid_data_loader = DataLoader(valid_data)
    sess = tf.Session()
    model = DenseNet(config)
    logger = Logger(sess, config)
    trainer = DenseNetTrainer(sess, model, valid_data_loader,
                              valid_data_loader, config, logger)
    model.load(sess)
    if config.phase == "train":
        trainer.train()
    elif config.phase == "test":
        trainer.test(output_prediction_path)