def _build_data_loaders(self):
    transforms = self._get_transforms()

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=self.data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    test_dataset = Dataset(
        data_dir=self.test_data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )

    # DEFINE: DATA LOADER
    self.train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader_ = iter(self.test_data_loader)

def test(**kwargs):
    opt.parse(kwargs)

    if opt.device is not None:
        opt.device = torch.device(opt.device)
    elif opt.gpus:
        opt.device = torch.device(0)
    else:
        opt.device = torch.device('cpu')

    pretrain_model = load_pretrain_model(opt.pretrain_model_path)

    generator = GEN(opt.dropout, opt.image_dim, opt.text_dim, opt.hidden_dim,
                    opt.bit, pretrain_model=pretrain_model).to(opt.device)

    path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit)
    load_model(generator, path)
    generator.eval()

    images, tags, labels = load_data(opt.data_path, opt.dataset)

    i_query_data = Dataset(opt, images, tags, labels, test='image.query')
    i_db_data = Dataset(opt, images, tags, labels, test='image.db')
    t_query_data = Dataset(opt, images, tags, labels, test='text.query')
    t_db_data = Dataset(opt, images, tags, labels, test='text.db')

    i_query_dataloader = DataLoader(i_query_data, opt.batch_size, shuffle=False)
    i_db_dataloader = DataLoader(i_db_data, opt.batch_size, shuffle=False)
    t_query_dataloader = DataLoader(t_query_data, opt.batch_size, shuffle=False)
    t_db_dataloader = DataLoader(t_db_data, opt.batch_size, shuffle=False)

    qBX = generate_img_code(generator, i_query_dataloader, opt.query_size)
    qBY = generate_txt_code(generator, t_query_dataloader, opt.query_size)
    rBX = generate_img_code(generator, i_db_dataloader, opt.db_size)
    rBY = generate_txt_code(generator, t_db_dataloader, opt.db_size)

    query_labels, db_labels = i_query_data.get_labels()
    query_labels = query_labels.to(opt.device)
    db_labels = db_labels.to(opt.device)

    mapi2t = calc_map_k(qBX, rBY, query_labels, db_labels)
    mapt2i = calc_map_k(qBY, rBX, query_labels, db_labels)
    print('...test MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i))

def train(args):
    train_transforms = transforms.Compose([
        transforms.Resize(args.image_shape),
        transforms.RandomHorizontalFlip(),
        transforms.Normalize()
    ])
    eval_transforms = transforms.Compose(
        [transforms.Resize(args.image_shape), transforms.Normalize()])

    train_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.train_list,
        transforms=train_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=True)

    eval_dataset = None
    if args.val_list is not None:
        eval_dataset = Dataset(
            data_dir=args.data_dir,
            file_list=args.val_list,
            transforms=eval_transforms,
            num_workers='auto',
            buffer_size=100,
            parallel_method='thread',
            shuffle=False)

    if args.model_type == 'HumanSegMobile':
        model = HumanSegMobile(num_classes=2)
    elif args.model_type == 'HumanSegLite':
        model = HumanSegLite(num_classes=2)
    elif args.model_type == 'HumanSegServer':
        model = HumanSegServer(num_classes=2)
    else:
        raise ValueError(
            "--model_type: {} is set wrong, it should be one of "
            "('HumanSegMobile', 'HumanSegLite', 'HumanSegServer')".format(
                args.model_type))

    model.train(
        num_epochs=args.num_epochs,
        train_dataset=train_dataset,
        train_batch_size=args.batch_size,
        eval_dataset=eval_dataset,
        save_interval_epochs=args.save_interval_epochs,
        save_dir=args.save_dir,
        pretrained_weights=args.pretrained_weights,
        resume_weights=args.resume_weights,
        learning_rate=args.learning_rate,
        use_vdl=args.use_vdl)

def _build_data_loaders(self):
    transforms = self._get_transforms()
    extraimg_transform = torchvision.transforms.Compose(
        transforms["frame"].transforms[1:])

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=self.data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    test_dataset = Dataset(
        data_dir=self.test_data_dir,
        transforms=transforms,
        load_files=self.data_config["load_files"],
    )
    train_extraimg_dataset = IMGDataset(
        data_dir=self.extraimg_data_dir,
        data_type=self.extraimg_type,
        transform=extraimg_transform,
    )

    # DEFINE: DATA LOADER
    self.train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.train_extraimg_data_loader = DataLoader(
        dataset=train_extraimg_dataset,
        batch_size=self.model_config["batch_size"],
        shuffle=True,
        pin_memory=True,
        num_workers=self.model_config["batch_size"] // 2,
    )
    self.test_data_loader_ = iter(self.test_data_loader)

def gen_s_curve(rng, emissions):
    """Generate synthetic data from the S-curve dataset's generating process.
    """
    N = 500
    J = 100
    D = 2

    # Generate latent manifold.
    # -------------------------
    X, t = make_s_curve(N, random_state=rng)
    X = np.delete(X, obj=1, axis=1)
    X = X / np.std(X, axis=0)
    inds = t.argsort()
    X = X[inds]
    t = t[inds]

    # Generate kernel `K` and latent GP-distributed maps `F`.
    # -------------------------------------------------------
    K = kern.RBF(input_dim=D, lengthscale=1).K(X)
    F = rng.multivariate_normal(np.zeros(N), K, size=J).T

    # Generate emissions using `F` and/or `K`.
    # ----------------------------------------
    if emissions == 'bernoulli':
        P = logistic(F)
        Y = rng.binomial(1, P).astype(np.double)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'gaussian':
        Y = F + np.random.normal(0, scale=0.5, size=F.shape)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'multinomial':
        C = 100
        pi = np.exp(F - logsumexp(F, axis=1)[:, None])
        Y = np.zeros(pi.shape)
        for n in range(N):
            Y[n] = rng.multinomial(C, pi[n])
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'negbinom':
        P = logistic(F)
        R = np.arange(1, J + 1, dtype=float)
        Y = rng.negative_binomial(R, 1 - P)
        return Dataset('s-curve', False, Y, X, F, K, R, t)
    else:
        assert emissions == 'poisson'
        theta = np.exp(F)
        Y = rng.poisson(theta)
        return Dataset('s-curve', False, Y, X, F, K, None, t)

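# `logistic` is not defined in this snippet; a minimal sketch, assuming the
# helper is the standard sigmoid used to map `F` to (0, 1) probabilities:
import numpy as np

def logistic(x):
    # Standard sigmoid; assumed definition of the helper used above.
    return 1.0 / (1.0 + np.exp(-x))
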
def __init__(self):
    dataset = Dataset('config_compas.json')
    x, y = dataset.get_data(readable=True)

    # r = "Af_vs_all"
    r = "Af_vs_Caucasian"
    # r = "all"
    x, y = get_dataframe(x, y, requested=r)
    self.finder = RelationshipsFinder(pd.concat([x, y], axis=1))

def load_bridges():
    """Load NYC bridges dataset:
    https://data.cityofnewyork.us/Transportation/Bicycle-Counts-for-East-River-Bridges/gua4-p9wg
    """
    data = np.load('datasets/bridges.npy', allow_pickle=True)
    data = data[()]  # unwrap the 0-d object array np.load returns for a pickled dict
    Y = data['Y']
    labels = data['labels']
    return Dataset('bridges', True, Y, labels=labels)

def main():
    config = Config()
    create_dirs([config.summary_dir, config.checkpoint_dir])
    sess = tf.Session()

    train_data = Dataset(config.root,
                         config.train_image_file,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=None)
    valid_data = Dataset(config.root,
                         config.valid_image_file,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=None)

    train_data_loader = DataLoader(train_data)
    valid_data_loader = DataLoader(valid_data)

    model = DenseNet(config)
    logger = Logger(sess, config)
    trainer = DenseNetTrainer(sess, model, train_data_loader,
                              valid_data_loader, config, logger)
    model.load(sess)

    if config.phase == "train":
        trainer.train()
    elif config.phase == "test":
        trainer.test("prediction.csv")

def main(config_file):
    """Run the attack described by `config_file` and store the adversarial examples."""
    tf.reset_default_graph()

    with open(config_file) as config_file:
        config = json.load(config_file)

    dset = Dataset(config['dset_name'], config['dset_config'])
    model_file = get_model_file(config)

    with tf.device(config['device']):
        model = construct_model(config['dset_name'])
        attack = construct_attack(model, config, dset)

    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # Restore the checkpoint
        saver.restore(sess, model_file)

        # Iterate over the samples batch-by-batch
        num_eval_examples = config['num_eval_examples']
        eval_batch_size = config['eval_batch_size']
        num_batches = int(math.ceil(num_eval_examples / eval_batch_size))

        x_adv = []  # adv accumulator

        print('Iterating over {} batches'.format(num_batches))
        for ibatch in range(num_batches):
            bstart = ibatch * eval_batch_size
            bend = min(bstart + eval_batch_size, num_eval_examples)
            print('batch size: {}'.format(bend - bstart))

            x_batch, y_batch = dset.get_eval_data(bstart, bend)
            x_batch_adv = attack.perturb(x_batch, y_batch, sess)
            x_adv.append(x_batch_adv)

        print('Storing examples')
        path = data_path_join(config['store_adv_path'])
        x_adv = np.concatenate(x_adv, axis=0)
        np.save(path, x_adv)
        print('Examples stored in {}'.format(path))

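# `data_path_join` is used by several of these scripts but never shown.
# A minimal sketch of its assumed behavior; the 'data' root directory is a
# guess, not taken from the repository:
import os

def data_path_join(*parts):
    return os.path.join('data', *parts)
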
def evaluate(args):
    eval_transforms = transforms.Compose(
        [transforms.Resize(args.image_shape), transforms.Normalize()])

    eval_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.val_list,
        transforms=eval_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=False)

    model = models.load_model(args.model_dir)
    model.evaluate(eval_dataset, args.batch_size)

def load_congress():
    """Congress 109 data:
    https://github.com/jgscott/STA380/blob/master/data/congress109.csv
    https://github.com/jgscott/STA380/blob/master/data/congress109members.csv
    """
    df1 = pd.read_csv('datasets/congress109.csv')
    df2 = pd.read_csv('datasets/congress109members.csv')
    assert len(df1) == len(df2)

    # Ensure same ordering.
    df1 = df1.sort_values(by='name')
    df2 = df2.sort_values(by='name')

    Y = df1.values[:, 1:].astype(int)
    labels = np.array([0 if x == 'R' else 1 for x in df2.party.values])
    return Dataset('congress109', True, Y, labels=labels)

def instantiate_net(args, Train=True):
    module = importlib.import_module('Models.Class' + args['3_model_class'])
    class_ = getattr(module, args['3_model_class'])
    my_net = class_()

    with_GT = True
    if args['3_dimension'] == '':
        rescale_factor = 'Default'
    else:
        rescale_factor = args['3_dimension']

    my_dataset = Dataset()
    if Train:
        my_dataset.load_train(args['3_ds'], rescale_factor)
    my_dataset.load_test(args['3_ds'], rescale_factor)

    my_net.create_model(args, my_dataset)
    return my_dataset, my_net

def load_test_dataset(data_dir, syntax, max_example_actions_num):
    # all with unary closures
    terminal_vocab_file = os.path.join(data_dir, 'terminal_vocab.txt')
    grammar_file = os.path.join(data_dir, 'grammar.txt.uc.bin')

    grammar = deserialize_from_file(grammar_file)
    terminal_vocab = Vocab(
        terminal_vocab_file,
        data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])
    vocab = Vocab(
        os.path.join(data_dir, 'vocab.txt'),
        data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])

    prefix = 'uc_' + syntax + '_'
    test_dir = os.path.join(data_dir, 'test')
    # Assumed save path, following the prefix + 'test.pth' convention used by
    # load_dataset below; `test_file` was otherwise undefined.
    test_file = os.path.join(test_dir, prefix + 'test.pth')

    test = Dataset(test_dir, 'test', grammar, vocab, terminal_vocab, syntax,
                   max_example_actions_num, True)
    torch.save(test, test_file)

def evaluate(args):
    eval_transforms = transforms.Compose(
        [transforms.Resize((192, 192)), transforms.Normalize()])

    eval_dataset = Dataset(
        data_dir=args.data_dir,
        file_list=args.quant_list,
        transforms=eval_transforms,
        num_workers='auto',
        buffer_size=100,
        parallel_method='thread',
        shuffle=False)

    model = models.load_model(args.model_dir)
    model.export_quant_model(
        dataset=eval_dataset,
        save_dir=args.save_dir,
        batch_size=args.batch_size,
        batch_nums=args.batch_nums)

def train(self, config, word2vec, tokenizer):
    from datasets.dataset import Dataset, DatasetParam

    dataset_args = DatasetParam()
    dataset_args.output_dir = config['data_params']['output_dir']
    dataset_args.embed_dim = config['data_params']['embed_dim']
    dataset_args.max_sentence_len = config['data_params']['max_sentence_len']
    dataset_args.min_word_freq = config['data_params']['min_word_freq']
    dataset_args.max_vocab_size = config['data_params']['max_vocab_size']
    dataset_args.test_rate = config['data_params']['test_rate']
    dataset_args.tokenizer = tokenizer
    dataset_args.data_dir = config['data_params']['data_dir']
    dataset_args.cate_list = config['model_params']['cate_list']
    dataset_args.word2vec_iterator = word2vec
    dataset_args.data_vocab_dir = config['data_params']['data_vocab_dir']
    dataset_args.data_vocab_tag = str(config['data_params']['data_vocab_tag'])
    dataset_args.data_file = config['data_params']['data_file']

    dataset = Dataset(dataset_args)
    train_set, test_set = dataset.buildWithAllData(False)

    x_train, y_train = zip(*train_set)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test, y_test = zip(*test_set)
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    # Load the Bernoulli naive Bayes model
    from sklearn.naive_bayes import BernoulliNB
    import joblib  # sklearn.externals.joblib was removed in recent scikit-learn
    classifier = BernoulliNB()

    # Train the model and save it
    classifier.fit(x_train, y_train)
    joblib.dump(classifier,
                os.path.join(dataset_args.output_dir, 'bayes_model.m'))

    # Predict on the test split and compute accuracy
    y_ = classifier.predict(x_test)
    acc = np.mean(
        [1 if y_[i] == y_test[i] else 0 for i in range(y_test.shape[0])],
        axis=0)
    print("eval acc: %f" % acc)

def get_dummy_data(domain, data_size, query_manager=None):
    # Draw each attribute independently from a random exponential-shaped
    # categorical distribution.
    dis = {}
    for attr, n in zip(domain.attrs, domain.shape):
        random_dist = np.random.exponential(10, n)
        random_dist = random_dist / np.sum(random_dist)
        dis[attr] = random_dist

    arr = [
        np.random.choice(n, data_size, p=dis[attr])
        for attr, n in zip(domain.attrs, domain.shape)
    ]
    values = np.array(arr).T
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if query_manager is not None:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plt.hist(ans)
        plt.show()

    return data

def _get_transforms(self):
    # for normalizer of mel
    mel_data_loader = None
    if not os.path.isfile(self.data_config["mel_normalizer_savefile"]):
        mel_dataset = Dataset(
            data_dir=self.data_dir,
            transforms={},
            load_files=["log_mel_spec", "mel_if"],
        )
        mel_data_loader = DataLoader(
            dataset=mel_dataset,
            batch_size=self.model_config["batch_size"],
            shuffle=False,
            pin_memory=True,
            num_workers=self.model_config["batch_size"] // 2,
        )

    # Data definitions
    frame_transforms = [
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize((256, 256)),
        torchvision.transforms.RandomHorizontalFlip(p=0.5),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    if self.model_config["flip"] is not True:
        # Remove RandomHorizontalFlip, which sits at index 2 (popping
        # index 1 would remove Resize and trip the assert).
        flip_transform = frame_transforms.pop(2)
        assert isinstance(flip_transform,
                          torchvision.transforms.RandomHorizontalFlip)

    transforms = {
        "frame": torchvision.transforms.Compose(frame_transforms),
        "mel": MelNormalizer(
            dataloader=mel_data_loader,
            savefile_path=self.data_config["mel_normalizer_savefile"],
        ),
    }
    return transforms

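# A hypothetical smoke test of the "frame" pipeline built above (flip
# disabled); the input shape is illustrative, not from the project.
import torch
import torchvision

frame_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToPILImage(),
    torchvision.transforms.Resize((256, 256)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
frame = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
out = frame_transforms(frame)
print(out.shape)  # torch.Size([3, 256, 256])
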
def main(config_file):
    """Evaluate stored adversarial examples against the model in `config_file`."""
    # deallocate memory if any
    tf.reset_default_graph()
    # free_gpus()

    # load configs
    with open(config_file) as config_file:
        config = json.load(config_file)

    # load dataset
    dset = Dataset(config['dset_name'], config['dset_config'])

    with tf.device(config['device']):
        model = construct_model(config['dset_name'])

    x_adv = np.load(data_path_join(config['store_adv_path']))
    model_file = get_model_file(config)
    num_eval_examples = config['num_eval_examples']
    eval_batch_size = config['eval_batch_size']
    target_shape = (num_eval_examples, ) + get_dataset_shape(config['dset_name'])

    check_values(x_adv, dset.min_value, dset.max_value)
    check_shape(x_adv, target_shape)

    res = get_res(model_file,
                  x_adv,
                  config['attack_config']['epsilon'],
                  model,
                  dset,
                  num_eval_examples=num_eval_examples,
                  eval_batch_size=eval_batch_size)
    return res

def _build_data_loader(data_dir, batch_size=256, sr=16000):
    transforms = {}
    if sr != 32000:
        transforms["audio"] = lambda audio: librosa.resample(audio, sr, 32000)
        print(f"[!] sr: {sr} -> 32000")

    # DEFINE: DATASETS
    train_dataset = Dataset(
        data_dir=data_dir,
        transforms=transforms,
        load_files=["audio"],
    )

    # DEFINE: DATA LOADER
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=batch_size // 2,
    )
    return train_data_loader

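# A hypothetical call site for `_build_data_loader`; the path is illustrative
# and batches are assumed to be dicts keyed by the loaded file names.
loader = _build_data_loader("data/train", batch_size=64, sr=16000)
for batch in loader:
    audio = batch["audio"]  # resampled to 32 kHz by the transform above
    break
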
def get_loader(csv_dir, split, resample, slices_per_example, batch_size,
               num_workers, toy, input_scan, output_scan):
    """Initialize the data loader."""
    csv_dir = Path(csv_dir)
    # Default csv path is csv_dir / train.csv or whatever split is
    csv_path = str(csv_dir / f'{split}.csv')

    dataset = Dataset(csv_path=csv_path,
                      split=split,
                      toy=toy,
                      input_scan=input_scan,
                      output_scan=output_scan,
                      resample=resample,
                      num_slices=slices_per_example)

    loader = data.DataLoader(dataset,
                             batch_size=batch_size,
                             drop_last=False,
                             pin_memory=True,
                             num_workers=num_workers)
    return loader

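# A hypothetical invocation of `get_loader`; every value below is a
# placeholder, not taken from the project's configs.
val_loader = get_loader(csv_dir='data/csvs', split='valid', resample=True,
                        slices_per_example=8, batch_size=4, num_workers=2,
                        toy=False, input_scan='T1', output_scan='T2')
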
def evaluate():
    config = Config()

    valid_data = Dataset(config.root,
                         valid_image_paths,
                         config.type,
                         transform=Augmentaton(size=config.resize,
                                               mean=config.means[config.type],
                                               std=config.stds[config.type]),
                         max_samples=10)
    valid_data_loader = DataLoader(valid_data)

    sess = tf.Session()
    model = DenseNet(config)
    logger = Logger(sess, config)
    trainer = DenseNetTrainer(sess, model, valid_data_loader,
                              valid_data_loader, config, logger)
    model.load(sess)

    if config.phase == "train":
        trainer.train()
    elif config.phase == "test":
        trainer.test(output_prediction_path)

def get_dummy_data2(domain, data_size, query_manager, display=False):
    num_attr = len(domain.attrs)

    # Fix random values for the attributes touched by the first few
    # workloads, up to half of the attributes.
    bag = {}
    for i in range(len(query_manager.workloads)):
        if len(bag) >= num_attr // 2:
            break
        for attr in query_manager.workloads[i]:
            attr_id = query_manager.att_id[attr]
            if attr_id not in bag:
                attr_size = domain.shape[attr_id]
                bag[attr_id] = np.random.randint(0, attr_size)

    arr = []
    for _ in range(data_size):
        arr.append(get_dummy_row(domain, bag))
    values = np.array(arr)
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if display:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plot_bins(ans, title='Dummy')

    return data

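# `get_dummy_row` is not shown in this snippet. A minimal sketch under the
# assumption that it keeps the fixed values in `bag` and samples every other
# attribute uniformly at random:
import numpy as np

def get_dummy_row(domain, bag):
    row = []
    for i, n in enumerate(domain.shape):
        row.append(bag[i] if i in bag else np.random.randint(0, n))
    return row
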
def main(config_file):
    np.random.seed(1)
    tf.reset_default_graph()

    config = load_config(config_file)

    # dataset
    dset_name = config['dset_name']
    dset = Dataset(dset_name, config['dset_config'])
    dset_shape = get_dataset_shape(config['dset_name'])
    dim = np.prod(dset_shape)

    # model and computational graph
    model_file = get_model_file(config)
    with tf.device(config['device']):
        model = construct_model(dset_name)
        grad = tf.gradients(model.xent, model.x_input)[0]
        flat_grad = tf.reshape(grad, [NUM_SAMPLES, -1])
        flat_sgn = tf_nsign(flat_grad)
        norm_flat_grad = tf.div(flat_grad,
                                tf.norm(flat_grad, axis=1, keepdims=True))
        sim_mat = tf.matmul(norm_flat_grad, norm_flat_grad, transpose_b=True)
        sims = tf.gather_nd(sim_mat,
                            list(zip(*np.triu_indices(NUM_SAMPLES, k=1))))
        dist_mat = (dim - tf.matmul(flat_sgn, flat_sgn, transpose_b=True)) / 2.0
        dists = tf.gather_nd(dist_mat,
                             list(zip(*np.triu_indices(NUM_SAMPLES, k=1))))

    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(data_path_join("hamming_dist_exp"))

    epsilon = config['attack_config']['epsilon']
    num_batches = int(math.ceil(NUM_EVAL_EXAMPLES / EVAL_BATCH_SIZE))

    for _epsilon in np.linspace(epsilon / 10, epsilon, 3):
        # histogram recorder
        tf.summary.histogram(
            "{}_hamming_dist_xr_sgn_grad_eps_{}_{}_samples_{}_pts".format(
                dset_name, _epsilon, NUM_SAMPLES, NUM_EVAL_EXAMPLES), dists)
        tf.summary.histogram(
            "{}_cosine_sim_xr_grad_eps_{}_{}_samples_{}_pts".format(
                dset_name, _epsilon, NUM_SAMPLES, NUM_EVAL_EXAMPLES), sims)
        summs = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            # Restore the checkpoint
            saver.restore(sess, model_file)

            # Iterate over the data points one-by-one
            print('Iterating over {} batches'.format(num_batches))
            for ibatch in range(num_batches):
                bstart = ibatch * EVAL_BATCH_SIZE
                bend = min(bstart + EVAL_BATCH_SIZE, NUM_EVAL_EXAMPLES)
                print('batch size: {}'.format(bend - bstart))

                x_batch, y_batch = dset.get_eval_data(bstart, bend)
                xr_batch = np.clip(
                    x_batch + np.random.uniform(
                        -_epsilon, _epsilon, [NUM_SAMPLES, *x_batch.shape[1:]]),
                    dset.min_value, dset.max_value)
                yr_batch = y_batch.repeat(NUM_SAMPLES)

                summ_val = sess.run(summs,
                                    feed_dict={
                                        model.x_input: xr_batch,
                                        model.y_input: yr_batch
                                    })
                writer.add_summary(summ_val, global_step=ibatch)

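# `tf_nsign` is not defined in this snippet. A sketch of its assumed behavior:
# a zero-free sign, so every entry of `flat_sgn` lies in {-1, +1} and
# (dim - <s, s'>) / 2 above counts sign disagreements, i.e. Hamming distance.
import tensorflow as tf

def tf_nsign(t):
    # sign(t) with zeros mapped to +1
    return tf.sign(tf.sign(t) + 0.5)
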
        state = {
            'agent': agent_state_dict,
            'epoch': epoch,
            'reward': reward,
            'dice': dice
        }
        torch.save(
            state, args.cv_dir + '/ckpt_E_%d_D_%.3f_R_%.2E_S_%.2f_#_%d.t7' %
            (epoch, dice, reward, sparsity, len(policy_set)))
        torch.save(state, args.cv_dir + '/best.t7')


best_dice = 0.0

if __name__ == '__main__':
    # define dataset
    train_ds = Dataset(os.path.join(args.data_dir, 'train', 'ct'),
                       os.path.join(args.data_dir, 'train', 'seg'))
    test_ds = Dataset(os.path.join(args.data_dir, 'test', 'ct'),
                      os.path.join(args.data_dir, 'test', 'seg'),
                      test=True)

    # define data loader
    trainloader = DataLoader(train_ds,
                             args.batch_size,
                             shuffle=True,
                             num_workers=0,
                             pin_memory=True)
    testloader = DataLoader(test_ds,
                            args.batch_size,
                            shuffle=False,
                            num_workers=0,
                            pin_memory=True)

def train(**kwargs):
    opt.parse(kwargs)

    if opt.vis_env:
        vis = Visualizer(opt.vis_env, port=opt.vis_port)

    if opt.device is None or opt.device == 'cpu':
        opt.device = torch.device('cpu')
    else:
        opt.device = torch.device(opt.device)

    images, tags, labels = load_data(opt.data_path, type=opt.dataset)
    train_data = Dataset(opt, images, tags, labels)
    train_dataloader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True)

    L = train_data.get_labels()
    L = L.to(opt.device)

    # test
    i_query_data = Dataset(opt, images, tags, labels, test='image.query')
    i_db_data = Dataset(opt, images, tags, labels, test='image.db')
    t_query_data = Dataset(opt, images, tags, labels, test='text.query')
    t_db_data = Dataset(opt, images, tags, labels, test='text.db')

    i_query_dataloader = DataLoader(i_query_data, opt.batch_size, shuffle=False)
    i_db_dataloader = DataLoader(i_db_data, opt.batch_size, shuffle=False)
    t_query_dataloader = DataLoader(t_query_data, opt.batch_size, shuffle=False)
    t_db_dataloader = DataLoader(t_db_data, opt.batch_size, shuffle=False)

    query_labels, db_labels = i_query_data.get_labels()
    query_labels = query_labels.to(opt.device)
    db_labels = db_labels.to(opt.device)

    pretrain_model = load_pretrain_model(opt.pretrain_model_path)

    generator = GEN(opt.dropout, opt.image_dim, opt.text_dim, opt.hidden_dim,
                    opt.bit, opt.num_label,
                    pretrain_model=pretrain_model).to(opt.device)
    discriminator = DIS(opt.hidden_dim // 4, opt.hidden_dim // 8,
                        opt.bit).to(opt.device)

    optimizer = Adam([
        # {'params': generator.cnn_f.parameters()},  # freeze parameters of cnn_f
        {'params': generator.image_module.parameters()},
        {'params': generator.text_module.parameters()},
        {'params': generator.hash_module.parameters()}
    ], lr=opt.lr, weight_decay=0.0005)

    optimizer_dis = {
        'feature': Adam(discriminator.feature_dis.parameters(),
                        lr=opt.lr, betas=(0.5, 0.9), weight_decay=0.0001),
        'hash': Adam(discriminator.hash_dis.parameters(),
                     lr=opt.lr, betas=(0.5, 0.9), weight_decay=0.0001)
    }

    tri_loss = TripletLoss(opt, reduction='sum')

    loss = []
    max_mapi2t = 0.
    max_mapt2i = 0.
    max_average = 0.
    mapt2i_list = []
    mapi2t_list = []
    train_times = []

    B_i = torch.randn(opt.training_size, opt.bit).sign().to(opt.device)
    B_t = B_i
    H_i = torch.zeros(opt.training_size, opt.bit).to(opt.device)
    H_t = torch.zeros(opt.training_size, opt.bit).to(opt.device)

    for epoch in range(opt.max_epoch):
        t1 = time.time()
        e_loss = 0
        for i, (ind, img, txt, label) in tqdm(enumerate(train_dataloader)):
            imgs = img.to(opt.device)
            txt = txt.to(opt.device)
            labels = label.to(opt.device)
            batch_size = len(ind)

            h_i, h_t, f_i, f_t = generator(imgs, txt)
            H_i[ind, :] = h_i.data
            H_t[ind, :] = h_t.data
            h_t_detach = generator.generate_txt_code(txt)

            #####
            # train feature discriminator
            #####
            D_real_feature = discriminator.dis_feature(f_i.detach())
            D_real_feature = -opt.gamma * torch.log(
                torch.sigmoid(D_real_feature)).mean()
            # D_real_feature = -D_real_feature.mean()
            optimizer_dis['feature'].zero_grad()
            D_real_feature.backward()

            # train with fake
            D_fake_feature = discriminator.dis_feature(f_t.detach())
            D_fake_feature = -opt.gamma * torch.log(
                torch.ones(batch_size).to(opt.device) -
                torch.sigmoid(D_fake_feature)).mean()
            # D_fake_feature = D_fake_feature.mean()
            D_fake_feature.backward()

            # train with gradient penalty
            alpha = torch.rand(batch_size, opt.hidden_dim // 4).to(opt.device)
            interpolates = alpha * f_i.detach() + (1 - alpha) * f_t.detach()
            interpolates.requires_grad_()
            disc_interpolates = discriminator.dis_feature(interpolates)
            gradients = autograd.grad(
                outputs=disc_interpolates,
                inputs=interpolates,
                grad_outputs=torch.ones(disc_interpolates.size()).to(opt.device),
                create_graph=True,
                retain_graph=True,
                only_inputs=True)[0]
            gradients = gradients.view(gradients.size(0), -1)
            # 10 is the gradient penalty hyperparameter
            feature_gradient_penalty = ((gradients.norm(2, dim=1) - 1)**2).mean() * 10
            feature_gradient_penalty.backward()

            optimizer_dis['feature'].step()

            #####
            # train hash discriminator
            #####
            D_real_hash = discriminator.dis_hash(h_i.detach())
            D_real_hash = -opt.gamma * torch.log(torch.sigmoid(D_real_hash)).mean()
            optimizer_dis['hash'].zero_grad()
            D_real_hash.backward()

            # train with fake
            D_fake_hash = discriminator.dis_hash(h_t.detach())
            D_fake_hash = -opt.gamma * torch.log(
                torch.ones(batch_size).to(opt.device) -
                torch.sigmoid(D_fake_hash)).mean()
            D_fake_hash.backward()

            # train with gradient penalty
            alpha = torch.rand(batch_size, opt.bit).to(opt.device)
            interpolates = alpha * h_i.detach() + (1 - alpha) * h_t.detach()
            interpolates.requires_grad_()
            disc_interpolates = discriminator.dis_hash(interpolates)
            gradients = autograd.grad(
                outputs=disc_interpolates,
                inputs=interpolates,
                grad_outputs=torch.ones(disc_interpolates.size()).to(opt.device),
                create_graph=True,
                retain_graph=True,
                only_inputs=True)[0]
            gradients = gradients.view(gradients.size(0), -1)
            hash_gradient_penalty = ((gradients.norm(2, dim=1) - 1)**2).mean() * 10
            hash_gradient_penalty.backward()

            optimizer_dis['hash'].step()

            loss_G_txt_feature = -torch.log(
                torch.sigmoid(discriminator.dis_feature(f_t))).mean()
            loss_adver_feature = loss_G_txt_feature

            loss_G_txt_hash = -torch.log(
                torch.sigmoid(discriminator.dis_hash(h_t_detach))).mean()
            loss_adver_hash = loss_G_txt_hash

            tri_i2t = tri_loss(h_i, labels, target=h_t, margin=opt.margin)
            tri_t2i = tri_loss(h_t, labels, target=h_i, margin=opt.margin)
            weighted_cos_tri = tri_i2t + tri_t2i

            i_ql = torch.sum(torch.pow(B_i[ind, :] - h_i, 2))
            t_ql = torch.sum(torch.pow(B_t[ind, :] - h_t, 2))
            loss_quant = i_ql + t_ql

            err = opt.alpha * weighted_cos_tri + \
                  opt.beta * loss_quant + \
                  opt.gamma * (loss_adver_feature + loss_adver_hash)
            optimizer.zero_grad()
            err.backward()
            optimizer.step()

            e_loss = err + e_loss

        # closed-form update of the binary codes at the end of each epoch
        P_i = torch.inverse(
            L.t() @ L + opt.lamb * torch.eye(opt.num_label, device=opt.device)
        ) @ L.t() @ B_i
        P_t = torch.inverse(
            L.t() @ L + opt.lamb * torch.eye(opt.num_label, device=opt.device)
        ) @ L.t() @ B_t
        B_i = (L @ P_i + opt.mu * H_i).sign()
        B_t = (L @ P_t + opt.mu * H_t).sign()

        loss.append(e_loss.item())
        print('...epoch: %3d, loss: %3.3f' % (epoch + 1, loss[-1]))
        delta_t = time.time() - t1

        if opt.vis_env:
            vis.plot('loss', loss[-1])

        # validate
        if opt.valid and (epoch + 1) % opt.valid_freq == 0:
            mapi2t, mapt2i = valid(generator, i_query_dataloader,
                                   i_db_dataloader, t_query_dataloader,
                                   t_db_dataloader, query_labels, db_labels)
            print('...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f'
                  % (epoch + 1, mapi2t, mapt2i))

            mapi2t_list.append(mapi2t)
            mapt2i_list.append(mapt2i)
            train_times.append(delta_t)

            if 0.5 * (mapi2t + mapt2i) > max_average:
                max_mapi2t = mapi2t
                max_mapt2i = mapt2i
                max_average = 0.5 * (mapi2t + mapt2i)
                save_model(generator)

            if opt.vis_env:
                vis.plot('mapi2t', mapi2t)
                vis.plot('mapt2i', mapt2i)

        if epoch % 100 == 0:
            for params in optimizer.param_groups:
                params['lr'] = max(params['lr'] * 0.8, 1e-6)

    if not opt.valid:
        save_model(generator)

    print('...training procedure finish')
    if opt.valid:
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (max_mapi2t, max_mapt2i))
    else:
        mapi2t, mapt2i = valid(generator, i_query_dataloader, i_db_dataloader,
                               t_query_dataloader, t_db_dataloader,
                               query_labels, db_labels)
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (mapi2t, mapt2i))

    path = 'checkpoints/' + opt.dataset + '_' + str(opt.bit)
    with open(os.path.join(path, 'result.pkl'), 'wb') as f:
        pickle.dump([train_times, mapi2t_list, mapt2i_list], f)

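# Both discriminator updates above repeat the same WGAN-GP
# interpolation-and-penalty pattern. A self-contained helper capturing that
# shared pattern (a sketch, not this repository's API):
import torch
from torch import autograd

def gradient_penalty(critic, real, fake, lam=10.0):
    # Interpolate between real and fake batches, then penalize deviations
    # of the critic's gradient norm from 1.
    alpha = torch.rand(real.size(0), 1, device=real.device)
    inter = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    scores = critic(inter)
    grads = autograd.grad(outputs=scores, inputs=inter,
                          grad_outputs=torch.ones_like(scores),
                          create_graph=True, retain_graph=True,
                          only_inputs=True)[0]
    grads = grads.view(grads.size(0), -1)
    return lam * ((grads.norm(2, dim=1) - 1) ** 2).mean()
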
def main(config_file):
    np.random.seed(1)
    tf.reset_default_graph()

    config = load_config(config_file)
    dset_name = config['dset_name']
    dset = Dataset(dset_name, config['dset_config'])
    model_file = get_model_file(config)
    epsilon = config['attack_config']['epsilon']

    with tf.device(config['device']):
        model = construct_model(dset_name)
        abs_grad = tf.abs(tf.gradients(model.xent, model.x_input)[0])

    # histogram recorder
    # placeholders for dx at x0 and x_rand
    dxo = tf.placeholder(tf.float32, shape=get_dataset_shape(dset_name))
    tf.summary.histogram("{}_part_deriv_mag_xo".format(dset_name), dxo)
    dxr = tf.placeholder(tf.float32, shape=get_dataset_shape(dset_name))
    tf.summary.histogram("{}_part_deriv_mag_xr".format(dset_name), dxr)

    writer = tf.summary.FileWriter(data_path_join("partial_derivative_exp"))
    summaries = tf.summary.merge_all()

    saver = tf.train.Saver()
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # Restore the checkpoint
        saver.restore(sess, model_file)

        # Iterate over the samples batch-by-batch
        eval_batch_size = config['eval_batch_size']
        num_batches = int(math.ceil(NUM_EVAL_EXAMPLES / eval_batch_size))

        print('Iterating over {} batches'.format(num_batches))
        for ibatch in range(num_batches):
            bstart = ibatch * eval_batch_size
            bend = min(bstart + eval_batch_size, NUM_EVAL_EXAMPLES)
            print('batch size: {}'.format(bend - bstart))

            x_batch, y_batch = dset.get_eval_data(bstart, bend)
            xr_batch = np.clip(
                x_batch + np.random.uniform(-epsilon, epsilon, x_batch.shape),
                dset.min_value, dset.max_value)

            dxo_batch = sess.run(abs_grad, feed_dict={
                model.x_input: x_batch,
                model.y_input: y_batch
            })
            dxr_batch = sess.run(abs_grad, feed_dict={
                model.x_input: xr_batch,
                model.y_input: y_batch
            })

            for i, step in enumerate(range(bstart, bend)):
                summ = sess.run(summaries,
                                feed_dict={dxo: dxo_batch[i], dxr: dxr_batch[i]})
                writer.add_summary(summ, global_step=step)

def generate(real_answers: np.array, N: int, domain: Domain,
             query_manager: QueryManager, epsilon: float, delta: float,
             epsilon_split: float, noise_multiple: float, samples: int,
             alpha=0, show_prgress=True):
    assert epsilon_split > 0
    assert noise_multiple > 0

    neg_real_answers = 1 - real_answers
    D = np.sum(domain.shape)
    Q_size = query_manager.num_queries

    prev_queries = []
    neg_queries = []
    final_oh_fake_data = []  # stores the final data

    # Calculate the total number of rounds using advanced composition.
    T, epsilon_0 = get_iters(epsilon, delta, epsilon_split)
    # print(f'epsilon_0 = {epsilon_0}')
    exponential_scale = np.sqrt(T) * noise_multiple

    if show_prgress:
        progress_bar = tqdm(total=T)

    for t in range(T):
        # Sample `samples` times from FTPL, split across worker processes.
        util2.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload, q_weights = query_manager.get_query_workload_weighted(
            prev_queries)
        neg_query_workload, n_weights = query_manager.get_query_workload_weighted(
            neg_queries)

        for __ in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload, q_weights,
                                    neg_query_workload, n_weights, noise,
                                    domain, alpha, temp_s))
            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()
        util2.enablePrint()

        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            final_oh_fake_data.append(x)

        assert len(oh_fake_data) == samples, \
            "len(D_hat) = {} len(fake_data) = {}".format(len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, \
                "D_hat dim = {}".format(len(oh_fake_data[0]))
        # assert not final_oh_fake_data or len(final_oh_fake_data[0][1]) == D, "D_hat dim = {}".format(len(oh_fake_data[0]))

        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)

        # Compute the exponential-mechanism distribution over queries.
        fake_answers = query_manager.get_answer(fake_data)
        neg_fake_answers = 1 - fake_answers
        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        EM_dist_0 = np.exp(epsilon_0 * score * N / 2, dtype=np.float128)
        sum = np.sum(EM_dist_0)
        assert sum > 0
        assert not np.isinf(sum)
        EM_dist = EM_dist_0 / sum
        assert not np.isnan(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)
        assert not np.isinf(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)

        # Sample one query from the exponential mechanism.
        q_t_ind = util2.sample(EM_dist)
        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

        if show_prgress:
            progress_bar.update()
            progress_bar.set_postfix({
                'max error': f'{np.max(score):.3f}',
                'round error': f'{score[q_t_ind]:.3f}'
            })

    if show_prgress:
        progress_bar.close()

    final_fem_data = Dataset(
        pd.DataFrame(util2.decode_dataset(final_oh_fake_data, domain),
                     columns=domain.attrs), domain)
    return final_fem_data

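# `util2.sample` is assumed to draw a single index from the (high-precision)
# probability vector `EM_dist`; a minimal inverse-CDF sketch of that assumed
# behavior:
import numpy as np

def sample(dist):
    u = np.random.rand()
    return int(np.searchsorted(np.cumsum(dist), u))
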
def load_dataset(config, force_regenerate=False):
    dj_dir = './preprocessed/django'
    logging.info('=' * 80)
    logging.info('Loading datasets from folder ' + dj_dir)
    logging.info('=' * 80)

    train, test, dev = None, None, None
    prefix = config.syntax + '_'
    if config.unary_closures:
        prefix += 'uc_'

    train_dir = os.path.join(dj_dir, 'train')
    train_file = os.path.join(train_dir, prefix + 'train.pth')
    if not force_regenerate and os.path.isfile(train_file):
        logging.info('Train dataset found, loading...')
        train = torch.load(train_file)
        train.config = config

    test_dir = os.path.join(dj_dir, 'test')
    test_file = os.path.join(test_dir, prefix + 'test.pth')
    if not force_regenerate and os.path.isfile(test_file):
        logging.info('Test dataset found, loading...')
        test = torch.load(test_file)
        test.config = config

    dev_dir = os.path.join(dj_dir, 'dev')
    dev_file = os.path.join(dev_dir, prefix + 'dev.pth')
    if not force_regenerate and os.path.isfile(dev_file):
        logging.info('Dev dataset found, loading...')
        dev = torch.load(dev_file)
        dev.config = config

    if train is None or test is None or dev is None:
        terminal_vocab_file = os.path.join(dj_dir, 'terminal_vocab.txt')
        if config.unary_closures:
            grammar_file = os.path.join(dj_dir, 'grammar.txt.uc.bin')
        else:
            grammar_file = os.path.join(dj_dir, 'grammar.txt.bin')

        grammar = deserialize_from_file(grammar_file)
        terminal_vocab = Vocab(
            terminal_vocab_file,
            data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])
        vocab = Vocab(
            os.path.join(dj_dir, 'vocab.txt'),
            data=[Constants.UNK_WORD, Constants.EOS_WORD, Constants.PAD_WORD])

        if test is None:
            logging.info('Test dataset not found, generating...')
            test = Dataset(test_dir, 'test', grammar, vocab, terminal_vocab,
                           config.syntax, config.max_example_action_num,
                           config.unary_closures)
            torch.save(test, test_file)
        if dev is None:
            logging.info('Dev dataset not found, generating...')
            dev = Dataset(dev_dir, 'dev', grammar, vocab, terminal_vocab,
                          config.syntax, config.max_example_action_num,
                          config.unary_closures)
            torch.save(dev, dev_file)
        if train is None:
            logging.info('Train dataset not found, generating...')
            train = Dataset(train_dir, 'train', grammar, vocab, terminal_vocab,
                            config.syntax, config.max_example_action_num,
                            config.unary_closures)
            torch.save(train, train_file)

    train.prepare_torch(config.cuda)
    dev.prepare_torch(config.cuda)
    test.prepare_torch(config.cuda)
    return train, dev, test

from plotly.subplots import make_subplots

from datasets.dataset import Dataset, DatasetConfig
from graphics.graphs import draw_data_points, draw_loss_function, prepare_frame
from losses.loss_function import Loss
from models import linear

if __name__ == '__main__':
    # Generate the dataset
    dataset = Dataset(conf=DatasetConfig.load('apartment_prices'))

    # Build theoretical loss function
    loss_function = Loss(dataset=dataset, use_intercept=False)

    # Train the model
    a_hist, loss_hist = linear.train(dataset,
                                     epochs=100,
                                     lr=0.0004,
                                     early_stopping_delta=100)

    fig = make_subplots(rows=1, cols=2)
    draw_data_points(dataset=dataset, figure=fig)
    draw_loss_function(loss_function=loss_function, figure=fig)
    fig.update(
        frames=[prepare_frame(a, loss) for a, loss in zip(a_hist, loss_hist)])
    fig.update_layout(updatemenus=[
        dict(type="buttons",
             buttons=[dict(label="Train", method="animate", args=[None])])
    ],
                      showlegend=False)
    fig.show()

store_name = os.path.join(data_dir, '{}_tbl.h5'.format(exp_id))
offset = 0

# Rewrite all the results; alternatively, one could use `offset` above to
# append to the h5 file instead.
if os.path.exists(store_name):
    os.remove(store_name)

for _cf in cfs:
    # for reproducibility
    np.random.seed(1)
    config_file = config_path_join(_cf)
    tf.reset_default_graph()

    with open(config_file) as config_file:
        config = json.load(config_file)

    dset = Dataset(config['dset_name'], config['dset_config'])
    dset_dim = np.prod(get_dataset_shape(config['dset_name']))
    model_file = get_model_file(config)

    with tf.device(config['device']):
        model = construct_model(config['dset_name'])
        flat_est_grad = tf.placeholder(tf.float32, shape=[None, dset_dim])
        flat_grad = tf.reshape(
            tf.gradients(model.xent, model.x_input)[0], [-1, dset_dim])
        norm_flat_grad = tf.maximum(
            tf.norm(flat_grad, axis=1, keepdims=True),
            np.finfo(np.float64).eps)
        norm_flat_est_grad = tf.maximum(
            tf.norm(flat_est_grad, axis=1, keepdims=True),
            np.finfo(np.float64).eps)
        cos_sim = tf.reduce_sum(tf.multiply(