def data_generator(dataset, batch_size, shuffle):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                            drop_last=True, num_workers=0)
    for _x, _y, _B in dataloader:
        # replicate the single grayscale channel into 3 channels and scale to [0, 1]
        x = _x[:, None].expand(_x.shape[0], 3, _x.shape[1], _x.shape[2]).float() / 255.
        y = _y.squeeze(1)
        # bounding boxes divided by the 200-pixel image size (presumably to normalize to [0, 1])
        b = _B.squeeze(1).float() / 200
        yield cuda(x), cuda(y), cuda(b)
def train(model, optimizer, train_iter, vocab_size, grad_clip, padding_idx):
    model.train()  # put model in train mode (this is important because of dropout)
    total_loss = 0
    for batch in train_iter:
        # calculate model predictions
        question, answer = cuda(batch.question), cuda(batch.answer)
        outputs = model(question, answer)

        # calculate loss and backpropagate errors
        loss = F.nll_loss(outputs.view(-1, vocab_size),
                          answer[1:].view(-1),
                          ignore_index=padding_idx)  # answer[1:] skips the <sos> token
        optimizer.zero_grad()  # reset gradients every batch; zeroing only once would accumulate them across batches
        loss.backward()
        total_loss += loss.data[0]  # pre-0.4 PyTorch spelling (loss.item() in later releases)

        # clip gradients to avoid exploding gradient
        clip_grad_norm(model.parameters(), grad_clip)

        # update parameters
        optimizer.step()
    return total_loss / len(train_iter)
def preprocess_mnistmulti(item):
    _x, _y, _B = item
    # replicate the grayscale channel into 3 channels and scale to [0, 1]
    x = _x[:, None].expand(_x.shape[0], 3, _x.shape[1], _x.shape[2]).float() / 255.
    # sort the digits and fold them into a single multi-digit class label
    y = _y.sort(dim=-1)[0]
    n_digits = y.shape[1]
    new_y = y[:, 0]
    for digit in range(1, n_digits):
        new_y = new_y * 10 + y[:, digit]
    y = new_y
    b = _B.squeeze(1).float()
    return cuda(x), cuda(y), cuda(b)
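# Worked example of the label fold above (illustrative only, not part of the
# original file): with n_digits = 2, a sorted row y = [2, 7] becomes
# 2 * 10 + 7 = 27, i.e. the sorted digits are read off as one base-10 number.
# A minimal standalone check, assuming only torch:
#
#   import torch
#   y = torch.tensor([[2, 7], [0, 3]])
#   label = y[:, 0]
#   for d in range(1, y.shape[1]):
#       label = label * 10 + y[:, d]
#   print(label)  # tensor([27, 3])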
def main():
    vocab, train_iter, val_iter, test_iter = dataset_factory(
        'twitter-customer-support')

    epochs = 100
    embedding_size = 20
    hidden_size = 100
    vocab_size = len(vocab)
    padding_idx = vocab.stoi['<pad>']

    encoder = Encoder(vocab_size, embedding_size, hidden_size)
    decoder = Decoder(vocab_size, embedding_size, hidden_size)
    seq2seq = cuda(Seq2Seq(encoder, decoder, vocab_size))
    optimizer = optim.Adam(seq2seq.parameters())

    best_val_loss = None
    for epoch in range(epochs):
        # calculate train and val loss
        train_loss = train(seq2seq, optimizer, train_iter, vocab_size, 5, padding_idx)
        val_loss = evaluate(seq2seq, val_iter, vocab_size, padding_idx)
        print("[Epoch=%d] train_loss %f - val_loss %f" % (epoch, train_loss, val_loss))

        # save model if model achieved best val loss
        if not best_val_loss or val_loss < best_val_loss:
            print('Saving model...')
            save_model(seq2seq, epoch, val_loss)
            best_val_loss = val_loss
def evaluate(model, val_iter, vocab_size, padding_idx):
    model.eval()  # put model in eval mode (this is important because of dropout)
    total_loss = 0
    for batch in val_iter:
        # calculate model predictions
        question, answer = cuda(batch.question), cuda(batch.answer)
        outputs = model(question, answer)

        # calculate batch loss
        loss = F.nll_loss(outputs.view(-1, vocab_size),
                          answer[1:].view(-1),
                          ignore_index=padding_idx)  # answer[1:] skip <sos> token
        total_loss += loss.data[0]
    return total_loss / len(val_iter)
def forward(self, src, trg):
    batch_size = src.size(1)
    trg_seq_len = trg.size(0) - 1  # - 1 because the first token in every sequence is <sos>
    # TODO note this in docs (dimensions don't match because we subtracted 1 from seq_len)
    outputs = cuda(Variable(torch.zeros(trg_seq_len, batch_size, self.vocab_size)))

    encoder_outputs, h_n = self.encoder(src)
    hidden = h_n  # output of all encoder layers for t=seq_len

    # <sos> for the whole batch
    # TODO check if we need to wrap tensor in new variable or just call trg[0]
    # on existing variable, what's the difference?
    input_word = cuda(Variable(trg.data[0], requires_grad=False))
    for t in range(trg_seq_len):
        output, hidden = self.decoder(input_word, hidden)
        outputs[t] = output
        # greedy decoding: feed the most likely word back in as the next input
        _, argmax = output.data.max(dim=1)
        input_word = cuda(Variable(argmax))
    return outputs
def __init__(self, embeddings, ignore_idx):
    super().__init__()
    voc_size = embeddings.shape[0]

    # Compute similarities
    print("Computing word similarities...")
    similarities = []
    for i in tqdm(range(voc_size)):
        similarities.append(
            F.cosine_similarity(embeddings[i].expand_as(embeddings), embeddings))
    similarities = cuda(torch.stack(similarities))

    # Ignore padding index penalties
    similarities[ignore_idx] = torch.zeros(voc_size)
    self.similarities = similarities
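# Illustrative sketch (assumption, not part of the original class): each row i
# of `similarities` holds the cosine similarity of word i to every word in the
# vocabulary, so the stacked tensor is a (voc_size, voc_size) similarity matrix.
# A tiny standalone example, assuming only torch:
#
#   import torch
#   import torch.nn.functional as F
#   emb = torch.randn(4, 8)                                  # 4 "words", 8-dim embeddings
#   row0 = F.cosine_similarity(emb[0].expand_as(emb), emb)   # shape (4,)
#   assert torch.isclose(row0[0], torch.tensor(1.0))         # self-similarity is 1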
def forward(self, t, row, col, lvl=None):
    """
    row and col are not required here. (unified interface with resolution regularizer)
    """
    if lvl is None:
        lvl = self.n_levels
    pc_par = []
    pc_chd = []
    for l in range(1, lvl + 1):
        current_level = noderange(self.n_branches, l)
        for k, i in enumerate(current_level):
            # pair each child node with its parent in the glimpse tree
            pc_par.append(t[(i - 1) // self.n_branches].b)
            pc_chd.append(t[i].b)
    loss_pc = (F_reg_pc(T.stack(pc_par, 1), T.stack(pc_chd, 1))
               if lvl >= 1 else cuda(T.zeros(1)))  # x.new(1).zero_()
    return self.coef * loss_pc
def Train_dis_BCE(netD, netG, real_loader, epochs=1, out=None):
    best_loss = np.Inf
    for _ in range(epochs):
        for i, real in enumerate(real_loader):
            size = len(real)
            fake = netG.sample(size)
            # fake samples are labeled 0, real samples are labeled 1
            data = util.Variable(T.cat([fake, util.cuda(real)], dim=0))
            label = util.Variable(T.cat([T.zeros(size, 1), T.ones(size, 1)]))

            netD.optim.zero_grad()
            loss = netD.BCELoss(data, label)
            loss.backward()
            netD.optim.step()

            # decay the learning rate by 3% every 10 batches
            if i % 10 == 0 and i != 0:
                for param_group in netD.optim.param_groups:
                    param_group['lr'] *= (1 - 0.03)

            # checkpoint the discriminator whenever the batch loss improves
            if out and loss.data[0] < best_loss:
                T.save(netD.state_dict(), out + ".pkg")
                best_loss = loss.data[0]
    return loss.data[0]
def __init__(self, stop_idcs, embeddings, ignore_idx, N=5, normalization=True):
    super().__init__()
    voc_size = embeddings.shape[0]
    all_targets = []

    print("Computing word similarities...")
    for word_idx in tqdm(range(voc_size)):
        target = torch.zeros(voc_size)
        if word_idx != ignore_idx:
            if word_idx not in stop_idcs:
                embedding = embeddings[word_idx]

                # Compute similarities
                similarities = F.cosine_similarity(
                    embedding.expand_as(embeddings), embeddings)

                # Get top N word neighbors with their similarities
                similarities, indices = torch.sort(similarities, descending=True)
                indices = indices[:N]
                similarities = similarities[:N]

                # Normalize computed similarities
                if normalization:
                    normalization_factor = torch.sum(similarities)
                else:
                    normalization_factor = 1
                weights = similarities / normalization_factor
                for i, idx in enumerate(indices):
                    target[idx] = weights[i]
        else:
            # Ignore padding index penalties
            target[word_idx] = 1
        all_targets.append(target)

    soft_targets = cuda(torch.stack(all_targets))
    self.soft_targets = soft_targets
def __init__(self,
             hidden_size,
             voc_size,
             padding_idx,
             init_idx,
             max_len,
             embeddings=None,
             embedding_dim=300):
    super().__init__()

    # Sizes
    if embeddings is not None:
        self.embedding_dim = embeddings.shape[1]
    else:
        self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.voc_size = voc_size
    self.max_len = max_len

    # Indices
    self.init_idx = init_idx
    self.padding_idx = padding_idx

    # Layers
    if embeddings is not None:
        self.embeddings = cuda(embeddings)
        self.emb = nn.Embedding.from_pretrained(self.embeddings, freeze=True)
    else:
        self.emb = nn.Embedding(self.voc_size, self.embedding_dim)
    self.enc = nn.LSTM(self.embedding_dim, self.hidden_size, batch_first=True)
    self.dec = nn.LSTMCell(self.embedding_dim, self.hidden_size)
    self.lin = nn.Linear(self.hidden_size, self.voc_size)
    self.dropout = nn.Dropout(p=0.5)
    hacks around the restriction from cleverhans that requires a 2D logits tensor
    '''

    def __init__(self, model):
        T.nn.Module.__init__(self)
        self.model = model

    def forward(self, x):
        y = self.model(x)
        if y.dim() == 3:
            return y.squeeze(1)
        else:
            return y


#model = cuda(DFSGlimpseSingleObjectClassifier())
model = cuda(tvmodels.ResNet(tvmodels.resnet.BasicBlock, [2, 2, 2, 2], 10))
model.load_state_dict(T.load('model.pt'))

s = tf.Session()
x_op = tf.placeholder(tf.float32, shape=(None, 3, 200, 200))
tf_model_fn = convert_pytorch_model_to_tf(cuda(TemporaryModule(model)))
cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

fgsm_op = FastGradientMethod(cleverhans_model, sess=s)
fgsm_params = {'eps': 0.01, 'clip_min': 0, 'clip_max': 1}
adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
adv_preds_op = tf_model_fn(adv_x_op)
preds_op = tf_model_fn(x_op)
total = 0
def kl_temperature(y, lbl, temperature=0.01):
    batch_size = y.shape[0]
    n_classes = y.shape[1]
    # build a one-hot target from the labels, then sharpen it with a low temperature
    y_logit = cuda(T.zeros(batch_size, n_classes))
    y_logit.scatter_(1, lbl.unsqueeze(-1), 1)
    return F.kl_div(F.log_softmax(y),
                    F.softmax(y_logit / temperature),
                    size_average=False) / batch_size
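# Illustrative note (assumption, not stated in the original file): scatter_
# writes a one-hot row per label, and dividing that one-hot by a small
# temperature before the softmax pushes the target distribution back towards
# (nearly) one-hot, so the KL term above behaves much like cross-entropy against
# the hard label. A minimal standalone check, assuming only torch:
#
#   import torch
#   import torch.nn.functional as F
#   y_logit = torch.zeros(1, 3)
#   y_logit[0, 2] = 1.0                      # one-hot for class 2
#   print(F.softmax(y_logit / 0.01, dim=1))  # ~tensor([[0., 0., 1.]])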
def preprocess_bird(item):
    _x, _y = item
    return cuda(_x), cuda(_y.squeeze(1)), None
def preprocess_imagenet(item):
    _x, _y, _ = item
    return cuda(_x), cuda(_y.squeeze(1)), None
def preprocess_cifar10(item):
    _x, _y = item
    return cuda(_x), cuda(_y), None
def forward(self, t, row, col, lvl=None):
    """
    row and col are not required here. (unified interface with resolution regularizer)
    """
    if lvl is None:
        lvl = self.n_levels
    cc_chd_a = []
    cc_chd_b = []
    for l in range(1, lvl + 1):
        current_level = noderange(self.n_branches, l)
        for k, i in enumerate(current_level):
            if (k + 1) % self.n_branches == 0:
                # once a full group of siblings has been seen, pair every two of them
                for a, b in itertools.combinations(
                        range(k - self.n_branches + 1, k + 1), 2):
                    cc_chd_a.append(t[current_level[a]].b)
                    cc_chd_b.append(t[current_level[b]].b)
    loss_cc = (F_reg_cc(T.stack(cc_chd_a, 1), T.stack(cc_chd_b, 1))
               if lvl >= 1 and self.n_branches > 1 else cuda(T.zeros(1)))  # x.new(1).zero_()
    return self.coef * loss_cc
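# Illustrative note (assumption, not part of the original file): with
# n_branches = 2 the combinations call pairs just the two siblings that were
# completed, while larger branching factors yield all sibling pairs, e.g.
#
#   import itertools
#   print(list(itertools.combinations(range(0, 2), 2)))  # [(0, 1)]
#   print(list(itertools.combinations(range(0, 3), 2)))  # [(0, 1), (0, 2), (1, 2)]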
            expr_setting, args.resume))
else:
    regularizer_classes = {
        PCRegularizer: args.pc_coef,
        CCRegularizer: args.cc_coef,
        ResRegularizer: args.res_coef
    }

network_params = NETWORK_PARAMS[args.dataset]

builder = cuda(
    nn.DataParallel(
        TreeBuilder(n_branches=n_branches,
                    n_levels=n_levels,
                    n_classes=network_params['n_classes'],
                    share=args.share,
                    regularizer_classes=regularizer_classes,
                    glimpse_type=args.glm_type,
                    glimpse_size=(args.glm_size, args.glm_size),
                    fm_target_size=network_params['fm_target_size'],
                    final_pool_size=network_params['final_pool_size'],
                    final_n_channels=network_params['final_n_channels'],
                    what__cnn=network_params['cnn'],
                    what__fix=args.fix,
                    what__in_dims=network_params['in_dims'])))
readout = cuda(
    nn.DataParallel(
        create_readout(args.readout,
                       share=args.share,
                       final_n_channels=network_params['final_n_channels'],
                       n_branches=n_branches,
                       n_levels=n_levels,
                       n_classes=network_params['n_classes'])))
    with open(w2vec_loc, 'rb') as f:
        w2vec = pickle.load(f)
else:
    w2vec = {}
    print("Loading word vectors...")
    with open(fasttext_loc) as f:
        f.__next__()
        for line in tqdm(f):
            items = line.strip().split(' ')
            token = items[0]
            vector = np.array(items[1:]).astype(float)
            w2vec[token] = vector
    with open(w2vec_loc, 'wb') as f:
        pickle.dump(w2vec, f)

dim = len(random.choice(list(w2vec.values())))
embeddings = cuda(
    torch.FloatTensor(match_embeddings(idx2w, w2vec, dim, bigram)))

# DATASET #
dataset_train = AutoencoderDataset(train_loc, voc, max_len, bigram=bigram)
dataset_dev = AutoencoderDataset(dev_loc, voc, max_len, bigram=bigram)
dataloader_train = DataLoader(dataset_train, batch_size, shuffle=True)
dataloader_dev = DataLoader(dataset_dev, batch_size, shuffle=True)
dataloaders = {'train': dataloader_train, 'dev': dataloader_dev}

# MODEL #
model = cuda(
    Autoencoder(hidden_size, voc_size, pad_idx, init_idx,
                             glimpse_size=(15, 15),
                             n_glimpses=n_glimpses),
            cnn,
        )
    #module = cnn
else:
    #cnn = miniresnet20(num_classes=10)
    #cnn = getattr(pytorch_cifar.models, args.cnn)(1000)
    cnn = getattr(torchvision.models, args.cnn)(pretrained=True)
    cnn.fc = T.nn.Linear(512 * 4, 120)
    module = T.nn.DataParallel(
        T.nn.Sequential(
            #MultiscaleGlimpse(glimpse_type='gaussian', glimpse_size=(50, 50), n_glimpses=n_glimpses),
            cnn,
        ))
module = cuda(module)
#module.load_state_dict(T.load('cnn.pt'))
#module.load_state_dict(dfs.update_module.cnn.state_dict())
'''
net = skorch.NeuralNetClassifier(
    module=module,
    #module=CNN,
    #module__cnn='cnn',
    #module__input_size=(15, 15),
    #module__h_dims=128,
    #module__n_classes=10,
    #module__kernel_size=(3, 3),
    #module__final_pool_size=(2, 2),
    #module__filters=[16, 32, 64, 128, 256],
    criterion=T.nn.CrossEntropyLoss,
    max_epochs=50,
    y = np.maximum(np.minimum(val.numpy(), 50), -50)
    x = np.arange(len(y))
    ax[i, j].plot(x, y, color='b')
    ax[i, j].plot(x, [0] * len(y), color='g')
    ax[i, j].set_title(title)


def display_image(fig, ax, i, j, image, title):
    ax[i, j].imshow(image, cmap='gray')
    ax[i, j].set_title(title)


if os.path.exists('grad_cnn.pt'):
    cnn = T.load('grad_cnn.pt')
    net_h = T.load('grad_net_h.pt')

#plt.subplots_adjust(wspace=0, hspace=100)
valid_loader = data_generator(mnist_valid, 1, valid_shuffle)
whole_glim = cuda(T.tensor([[0.5, 0.5, 1.0, 1.0, 0.5, 0.5]]))
cnt = 0
for x, y, b in valid_loader:
    glim = bbox_to_glimpse(b)
    grads = []
    losses = []
    # interpolate between the whole-image glimpse (j=0) and the ground-truth glimpse (j=1)
    for j in np.linspace(0, 1, 41):
        new_glim = glim * j + whole_glim * (1 - j)
        new_glim.requires_grad = True
        g = glimpse(x, new_glim.unsqueeze(1))[:, 0]
        if j == 0.:
            g_first = g[0][0].detach().cpu()
        if j == 1.:
            g_last = g[0][0].detach().cpu()
        out = net_h(cnn(g).view(1, -1))
        loss = F.cross_entropy(
                   n_digits=1,
                   backrand=0,
                   image_rows=size,
                   image_cols=size,
                   download=True)

n_glimpses = 3
glimpse = MultiscaleGlimpse(glimpse_type='gaussian',
                            glimpse_size=(15, 15),
                            n_glimpses=n_glimpses)
module = cuda(
    CNN(cnn='cnn',
        input_size=(15, 15),
        h_dims=128,
        n_classes=10,
        kernel_size=(3, 3),
        final_pool_size=(1, 1),
        filters=[16, 32, 64, 128, 256],
        pred=True,
        in_channels=3,
        n_patches=n_glimpses,
        coalesce_mode='sample'))
seq = T.nn.Sequential(glimpse, module)
seq.load_state_dict(T.load('cnntest.pt'))

rec = []
for i in range(100):
    x = cuda(mnist.train_data[i:i + 1, None].repeat(1, 3, 1, 1).float() / 255.)
    y = cuda(mnist.train_labels[i:i + 1, 0])
    b = cuda(T.zeros(1, 6))
    b.requires_grad = True
if args.dataset == 'imagenet':
    n_classes = 1000
    cnn = 'resnet18'
elif args.dataset == 'cifar10':
    n_classes = 10
    cnn = None
elif args.dataset.startswith('mnist'):
    n_classes = 10 ** args.n_digits
    cnn = None

builder = cuda(
    TreeBuilder(
        n_branches=n_branches,
        n_levels=n_levels,
        att_type=args.att_type,
        pc_coef=args.pc_coef,
        cc_coef=args.cc_coef,
        n_classes=n_classes,
        glimpse_type=args.glm_type,
        glimpse_size=GLIMPSE_SIZE,
        cnn=cnn,
    ))
readout = cuda(
    ReadoutModule(n_branches=n_branches,
                  n_levels=n_levels,
                  n_classes=n_classes))

batch_size = args.batch_size
builder.load_state_dict(
    T.load('checkpoints/{}_builder_best.pt'.format(expr_setting)))
readout.load_state_dict(
    T.load('checkpoints/{}_readout_best.pt'.format(expr_setting)))