def initialize_model(gpu, vocab_size, v_vec, emb_requires_grad, args):
    emb_dim = args.emb_dim
    h_dim = None
    class_num = 2
    is_gpu = True
    if gpu == -1:
        is_gpu = False

    if args.emb_type == 'ELMo' or args.emb_type == 'ELMoForManyLangs':
        bilstm = BiLSTM(emb_dim, h_dim, class_num, vocab_size, is_gpu, v_vec,
                        emb_type=args.emb_type, elmo_model_dir=args.emb_path)
    elif args.emb_type == 'None':
        bilstm = BiLSTM(emb_dim, h_dim, class_num, vocab_size, is_gpu, v_vec,
                        emb_type=args.emb_type)
    else:
        bilstm = BiLSTM(emb_dim, h_dim, class_num, vocab_size, is_gpu, v_vec,
                        emb_type=args.emb_type)

    if is_gpu:
        bilstm = bilstm.cuda()

    for m in bilstm.modules():
        print(m.__class__.__name__)
        weights_init(m)

    # Pre-trained word embeddings are only fine-tuned when requested.
    if args.emb_type not in ('ELMo', 'ELMoForManyLangs', 'None'):
        for param in bilstm.word_embed.parameters():
            param.requires_grad = emb_requires_grad

    return bilstm
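# --- Illustrative usage sketch (not part of the original source) ---
# initialize_model only reads args.emb_dim, args.emb_type, and args.emb_path,
# so a minimal namespace is enough to exercise it; emb_type='None' takes the
# randomly initialized embedding branch and gpu=-1 keeps everything on CPU.
# The concrete sizes below are placeholder values.
def _example_initialize_model():
    from types import SimpleNamespace
    args = SimpleNamespace(emb_dim=100, emb_type='None', emb_path=None)
    return initialize_model(gpu=-1, vocab_size=5000, v_vec=None,
                            emb_requires_grad=False, args=args)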
def initialize_model(gpu, vocab_size, v_vec, dropout_ratio, n_layers, model,
                     statistics_of_each_case_type):
    is_gpu = True
    if gpu == -1:
        is_gpu = False

    if model == 'Base' or model == 'FT':
        bilstm = BiLSTM(vocab_size, v_vec, dropout_ratio, n_layers, gpu=is_gpu)
    elif model == 'OneH':
        bilstm = OneHot(vocab_size, v_vec, dropout_ratio, n_layers, gpu=is_gpu)
    elif model == 'FA':
        bilstm = FeatureAugmentation(vocab_size, v_vec, dropout_ratio, n_layers, gpu=is_gpu)
    elif model == 'CPS':
        bilstm = ClassProbabilityShift(vocab_size, v_vec, dropout_ratio, n_layers,
                                       statistics_of_each_case_type=statistics_of_each_case_type,
                                       gpu=is_gpu)
    elif model == 'MIX':
        bilstm = Mixture(vocab_size, v_vec, dropout_ratio, n_layers,
                         statistics_of_each_case_type=statistics_of_each_case_type,
                         gpu=is_gpu)

    if is_gpu:
        bilstm = bilstm.cuda()

    for m in bilstm.modules():
        print(m.__class__.__name__)
        weights_init(m)

    return bilstm
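# --- Illustrative usage sketch (not part of the original source) ---
# The 'Base' branch only needs the embedding matrix and LSTM hyperparameters;
# the vocab size, dropout ratio, and layer count below are placeholder values,
# and statistics_of_each_case_type is only required for the 'CPS'/'MIX' models.
def _example_initialize_base_model(v_vec=None):
    return initialize_model(gpu=-1, vocab_size=20000, v_vec=v_vec,
                            dropout_ratio=0.2, n_layers=2, model='Base',
                            statistics_of_each_case_type=None)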
    # (tail of the model constructor call; the call opens in preceding context)
    embeddings=embeddings,
    hidden_size=hidden_size,
    num_labels=len(vocab),
    bidirectional=bidirectional,
    num_layers=num_layers,
    color_representation_size=54)

model_id = str(int(time.time())) + "w_fourier"
save_path = os.path.join(output_path, model_id)
if not os.path.isdir(save_path):
    os.makedirs(save_path)
writer = SummaryWriter(save_path)

if cuda:
    model.cuda()
print(model)

print(str(datetime.now()), 'Generating batches')
train_batches = BatchIterator(train_colors, vocab, batch_size, cuda=cuda)
test_batches = BatchIterator(test_colors, vocab, batch_size, cuda=cuda)

# optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.5)

pbar = tqdm.trange(epochs, desc='Training...')
delta = 0
delta_test = 0
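# --- Illustrative epoch-loop sketch (not part of the original source) ---
# Shows how the objects set up above (pbar, train_batches, optimizer, writer)
# are typically wired together; compute_loss is a hypothetical stand-in for the
# model's actual forward/loss interface, which is not shown in this snippet.
for epoch in pbar:
    epoch_loss = 0.0
    for batch in train_batches:
        optimizer.zero_grad()
        loss = model.compute_loss(batch)   # hypothetical helper, not the real API
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    writer.add_scalar('loss/train', epoch_loss, epoch)
    pbar.set_description('Training... loss={:.4f}'.format(epoch_loss))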
class Seq_MNIST_Trainer():

    def __init__(self, trainer_params, args):
        self.args = args
        self.trainer_params = trainer_params
        random.seed(trainer_params.random_seed)
        torch.manual_seed(trainer_params.random_seed)
        if args.cuda:
            torch.cuda.manual_seed_all(trainer_params.random_seed)

        kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
        self.train_data = seq_mnist_train(trainer_params)
        self.val_data = seq_mnist_val(trainer_params)
        self.train_loader = DataLoader(self.train_data, batch_size=trainer_params.batch_size,
                                       shuffle=True, **kwargs)
        self.val_loader = DataLoader(self.val_data, batch_size=trainer_params.test_batch_size,
                                     shuffle=True, **kwargs)
        self.starting_epoch = 1
        self.prev_loss = 10000

        self.model = BiLSTM(trainer_params)
        self.criterion = wp.CTCLoss(size_average=True)
        self.labels = [i for i in range(trainer_params.num_classes - 1)]
        self.decoder = seq_mnist_decoder(labels=self.labels)

        if args.resume or args.eval or args.export:
            print("Loading model from {}".format(args.save_path))
            package = torch.load(args.save_path, map_location=lambda storage, loc: storage)
            self.model.load_state_dict(package['state_dict'])

        if args.cuda:
            torch.cuda.set_device(args.gpus)
            self.model = self.model.cuda()

        self.optimizer = optim.Adam(self.model.parameters(), lr=trainer_params.lr)

        if args.resume:
            self.optimizer.load_state_dict(package['optim_dict'])
            self.starting_epoch = package['starting_epoch']
            self.prev_loss = package['prev_loss']
            if args.cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        if args.init_bn_fc_fusion:
            if not trainer_params.prefused_bn_fc:
                self.model.batch_norm_fc.init_fusion()
                self.trainer_params.prefused_bn_fc = True
            else:
                raise Exception("BN and FC are already fused.")

    def serialize(self, model, trainer_params, optimizer, starting_epoch, prev_loss):
        package = {'state_dict': model.state_dict(),
                   'trainer_params': trainer_params,
                   'optim_dict': optimizer.state_dict(),
                   'starting_epoch': starting_epoch,
                   'prev_loss': prev_loss}
        return package

    def save_model(self, epoch, loss_value):
        print("Model saved at: {}\n".format(self.args.save_path))
        self.prev_loss = loss_value
        torch.save(self.serialize(model=self.model,
                                  trainer_params=self.trainer_params,
                                  optimizer=self.optimizer,
                                  starting_epoch=epoch + 1,
                                  prev_loss=self.prev_loss),
                   self.args.save_path)

    def train(self, epoch):
        self.model.train()
        for i, item in enumerate(self.train_loader):
            data, labels, output_len, lab_len = item
            data = Variable(data.transpose(1, 0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                data = data.cuda()
            output = self.model(data)
            loss = self.criterion(output, labels, output_len, lab_len)
            loss_value = loss.data[0]
            print("Loss value for epoch = {}/{} and batch {}/{} is = {:.4f}".format(
                epoch, self.trainer_params.epochs,
                (i + 1) * self.trainer_params.batch_size, len(self.train_data), loss_value))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            if self.args.cuda:
                torch.cuda.synchronize()

    def test(self, epoch=0, save_model_flag=False):
        self.model.eval()
        loss_value = 0
        for i, item in enumerate(self.val_loader):
            data, labels, output_len, lab_len = item
            data = Variable(data.transpose(1, 0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                data = data.cuda()
            output = self.model(data)

            # Decode one random sample from the batch for a qualitative check.
            index = random.randint(0, self.trainer_params.test_batch_size - 1)
            label = labels[index * self.trainer_params.word_size:
                           (index + 1) * self.trainer_params.word_size].data.numpy()
            label = label - 1
            prediction = self.decoder.decode(output[:, index, :], output_len[index], lab_len[index])
            accuracy = self.decoder.hit(prediction, label)
            print("Sample Label = {}".format(self.decoder.to_string(label)))
            print("Sample Prediction = {}".format(self.decoder.to_string(prediction)))
            print("Accuracy on Sample = {:.2f}%\n\n".format(accuracy))

            loss = self.criterion(output, labels, output_len, lab_len)
            loss_value += loss.data.numpy()

        loss_value /= (len(self.val_data) // self.trainer_params.test_batch_size)
        print("Average Loss Value for Val Data is = {:.4f}\n".format(float(loss_value)))
        if loss_value < self.prev_loss and save_model_flag:
            self.save_model(epoch, loss_value)

    def eval_model(self):
        self.test()

    def train_model(self):
        for epoch in range(self.starting_epoch, self.trainer_params.epochs + 1):
            self.train(epoch)
            self.test(epoch=epoch, save_model_flag=True)
            if epoch % 20 == 0:
                self.optimizer.param_groups[0]['lr'] = self.optimizer.param_groups[0]['lr'] * 0.98

    def export_model(self, simd_factor, pe):
        self.model.eval()
        self.model.export('r_model_fw_bw.hpp', simd_factor, pe)

    def export_image(self, idx=100):
        img, label = self.val_data.images[:, idx, :], self.val_data.labels[0][idx]
        img = img.transpose(1, 0)
        label -= 1
        label = self.decoder.to_string(label)
        from PIL import Image
        from matplotlib import cm
        im = Image.fromarray(np.uint8(cm.gist_earth(img) * 255))
        im.save('test_image.png')
        img = img.transpose(1, 0)
        img = np.reshape(img, (-1, 1))
        np.savetxt("test_image.txt", img, fmt='%.10f')
        f = open('test_image_gt.txt', 'w')
        f.write(label)
        f.close()
        print("Exported image with label = {}".format(label))
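# --- Illustrative driver sketch (not part of the original source) ---
# Assumes trainer_params and args objects carrying the fields the class reads
# above (random_seed, batch_size, epochs, lr, cuda, save_path, eval, export, ...).
def _run_seq_mnist(trainer_params, args):
    trainer = Seq_MNIST_Trainer(trainer_params, args)
    if args.eval:
        trainer.eval_model()
    elif args.export:
        # simd_factor and pe are placeholders for the export tool's settings
        trainer.export_model(simd_factor=1, pe=1)
    else:
        trainer.train_model()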
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    train, dev, test, vocab = torch.load(open(options.data_file, 'rb'), pickle_module=dill)

    batched_train, batched_train_mask, _ = utils.tensor.advanced_batchize(
        train, options.batch_size, vocab.stoi["<pad>"])
    batched_dev, batched_dev_mask, _ = utils.tensor.advanced_batchize(
        dev, options.batch_size, vocab.stoi["<pad>"])

    vocab_size = len(vocab)

    if options.load_file:
        rnnlm = torch.load(options.load_file)
    else:
        rnnlm = BiLSTM(vocab_size)

    if use_cuda:
        rnnlm.cuda()
    else:
        rnnlm.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(rnnlm.parameters(), options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        rnnlm.train()  # switch back to training mode after the previous epoch's validation
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train))):
            train_batch = Variable(batched_train[batch_i])  # of size (seq_len, batch_size)
            train_mask = Variable(batched_train_mask[batch_i])
            if use_cuda:
                train_batch = train_batch.cuda()
                train_mask = train_mask.cuda()

            # (seq_len, batch_size, vocab_size)  # TODO: substitute this with your module
            sys_out_batch = rnnlm(train_batch)
            train_in_mask = train_mask.view(-1)
            train_in_mask = train_in_mask.unsqueeze(1).expand(len(train_in_mask), vocab_size)
            train_out_mask = train_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            train_out_batch = train_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(train_in_mask).view(-1, vocab_size)
            train_out_batch = train_out_batch.masked_select(train_out_mask)
            loss = criterion(sys_out_batch, train_out_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some paddings at the end
        dev_loss = 0.0
        rnnlm.eval()
        for batch_i in range(len(batched_dev)):
            dev_batch = Variable(batched_dev[batch_i], volatile=True)
            dev_mask = Variable(batched_dev_mask[batch_i], volatile=True)
            if use_cuda:
                dev_batch = dev_batch.cuda()
                dev_mask = dev_mask.cuda()

            sys_out_batch = rnnlm(dev_batch)
            dev_in_mask = dev_mask.view(-1)
            dev_in_mask = dev_in_mask.unsqueeze(1).expand(len(dev_in_mask), vocab_size)
            dev_out_mask = dev_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            dev_out_batch = dev_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(dev_in_mask).view(-1, vocab_size)
            dev_out_batch = dev_out_batch.masked_select(dev_out_mask)
            loss = criterion(sys_out_batch, dev_out_batch)
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(
            dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} "
        #                  "(previous dev loss: {1}, current: {2})".format(
        #                      epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break

        torch.save(rnnlm,
                   open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(
                       dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
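# --- Illustrative options sketch (not part of the original source) ---
# main() only reads the fields listed here; the concrete values (paths, sizes,
# learning rate) are placeholders, and estop is only used by the commented-out
# early-stopping check.
def _example_options():
    from argparse import Namespace
    return Namespace(gpuid=[], data_file='data.bin', batch_size=32,
                     load_file=None, optimizer='Adam', learning_rate=1e-3,
                     epochs=10, model_file='rnnlm', estop=1e-3)

# main(_example_options())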
"./data/wsj0_train_merged_labels.npy", batch_size=batch_size, shuffle=True) test_dataloader = create_dataloader("./data/wsj0_test", None, batch_size=batch_size, test=True, shuffle=False) model = BiLSTM(40, 256, 47, 5, use_gpu=True) # model = Model(40, 47, 256) if checkpoint: model.load_state_dict(torch.load(checkpoint)) model = model.cuda() ctc_loss = nn.CTCLoss() def criterion(out, label, data_len, label_len): loss = ctc_loss(out, label, data_len, label_len) reg_loss = 0 for param in model.parameters(): reg_loss += (param**2).sum() factor = 0.00001 loss += factor * reg_loss return loss optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=5e-5) # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 30, 40, 50, 60, 70, 80], gamma=0.5)