class Predictor():

    def __init__(self):
        try:
            with open("params.json", "r", encoding='utf8') as f:
                self.params = json.loads(f.read())
        except FileNotFoundError:
            self.params = save_corpus()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.model = BiGRU(torch.zeros((len(self.params['tbl']) + 1, 300)),
                           MODEL_PARAMS['gru_hidden_dim'],
                           MODEL_PARAMS['gru_num_layers'],
                           len(self.params['tagset']),
                           MODEL_PARAMS['concat']).to(device)
        self.model.load_state_dict(
            torch.load('trained_model.pt',
                       map_location=lambda storage, loc: storage))
        self.model.eval()

    def predict(self, sentence):
        words = sentence.split()
        lis = []
        new_words = []
        for word in words:
            # Split off a trailing punctuation symbol and tag it separately.
            symbol = None
            if not word[-1].isalnum():
                symbol = word[-1]
                word = word[:-1]
            if word.lower() in self.params['tbl']:
                lis.append(self.params['tbl'][word.lower()])
            else:
                lis.append(0)  # unknown word
            new_words.append(word)
            if symbol is not None:
                if symbol in self.params['tbl']:
                    lis.append(self.params['tbl'][symbol])
                else:
                    lis.append(0)
                new_words.append(symbol)
        x = torch.LongTensor(lis).to(self.device)
        x = x.unsqueeze(0)
        y_raw = self.model(x)
        y_pred = torch.argmax(y_raw, dim=2).view(-1)
        tagged_sent = ''
        for i in range(len(y_pred)):
            tagged_sent += new_words[i]
            tagged_sent += ' '
            tagged_sent += self.params['reverse_tagset'][y_pred[i]]
            tagged_sent += ' '
        print(tagged_sent)

    def tag_lookup(self, tag):
        try:
            print('TAG:', tag)
            print('Definition:', self.params['tag_definition'][tag][0])
        except KeyError:
            print('Error: Tag not found.')

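# Minimal usage sketch for the Predictor class above (an assumption, not part of
# the original source): it presumes "params.json" and "trained_model.pt" were
# produced by a prior training run, and that 'NN' is a tag present in the corpus.
if __name__ == "__main__":
    predictor = Predictor()
    # Prints each word followed by its predicted tag.
    predictor.predict("The quick brown fox jumps over the lazy dog.")
    # Prints the definition stored for the tag, if the corpus provides one.
    predictor.tag_lookup("NN")
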
def train():
    glove_pretrained, dataloaders, dataset_sizes, tbl, tagset, reverse_tagset, tag_definitions = preprocess()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = BiGRU(glove_pretrained,
                MODEL_PARAMS['gru_hidden_dim'],
                MODEL_PARAMS['gru_num_layers'],
                len(tagset),
                MODEL_PARAMS['concat']).to(device)
    criterion = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    train_model(device, net, dataloaders, dataset_sizes, criterion, optimizer,
                MODEL_PARAMS['num_epochs'])
    test(device, net, dataloaders['testing'])
    torch.save(net.state_dict(), 'trained_model.pt')

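# Hypothetical entry point (not in the original source): runs preprocessing,
# training, and evaluation end to end, writing 'trained_model.pt' on completion.
if __name__ == "__main__":
    train()
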
def model_load_test(test_df, vocab_file, embeddings_file, pretrained_file,
                    test_prediction_dir, test_prediction_name, mode,
                    num_labels=2, max_length=50, gpu_index=0, batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing BiGRU model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
          .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)

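# Illustrative call (an assumption; the CSV path, file names, and the "word"
# mode flag are placeholders, not taken from the original project):
test_df = pd.read_csv("data/test.csv")
model_load_test(test_df,
                vocab_file="data/vocab.txt",
                embeddings_file="data/embeddings.pkl",
                pretrained_file="models/best.pth.tar",
                test_prediction_dir="output/",
                test_prediction_name="test_prediction.csv",
                mode="word")
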
class Translate(object):

    def __init__(self, enc_nhids=1000, dec_nhids=1000, enc_embed=620,
                 dec_embed=620, src_vocab_size=30000, trg_vocab_size=30000,
                 **kwargs):
        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids * 2,
                               **kwargs)
        self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids),
                                           trg_vocab_size, prefix='logistic',
                                           drop_rate=kwargs['dropout'])
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params \
            + self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])

    def apply(self, source, source_mask, target, target_mask, **kwargs):
        sbelow = self.src_lookup_table.apply(source)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)
        s_rep = self.encoder.apply(sbelow, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)
        cost_matrix = self.logistic.cost(hiddens, target, target_mask)
        # Average cross entropy per sentence in the batch.
        self.cost = cost_matrix.sum() / target_mask.shape[1]

    def _next_prob_state(self, y, state, c, c_x):
        next_state, merge_out = self.decoder.next_state_merge(y, state, c, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        ctx = self.encoder.apply(sbelow, mask=None)
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        init_state = self.decoder.init_state(ctx)
        outs = [init_state, ctx]
        f_init = theano.function([x], outs, name='f_init')

        y = T.vector('y_sampler', dtype='int64')
        y_emb = self.trg_lookup_table.index(y)
        init_state = T.matrix('init_state', dtype='float32')
        next_probs, next_state = self._next_prob_state(y_emb, init_state, ctx, c_x)
        inps = [y, ctx, init_state]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')
        return f_init, f_next

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            value.set_value(params_value[key])

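# Sketch of greedy decoding with the f_init/f_next pair returned by
# build_sample() above. This is illustrative only: `src_ids` is assumed to be a
# numpy int64 array of shape (src_len, 1), and `bos_id`/`eos_id`/`max_len` are
# hypothetical constants, not names from the original code.
import numpy

def greedy_decode(translate, src_ids, bos_id, eos_id, max_len=80):
    f_init, f_next = translate.build_sample()
    state, ctx = f_init(src_ids)              # initial decoder state and encoder context
    y = numpy.array([bos_id], dtype='int64')
    output = []
    for _ in range(max_len):
        probs, state = f_next(y, ctx, state)
        next_word = int(probs[0].argmax())    # pick the most probable target word
        if next_word == eos_id:
            break
        output.append(next_word)
        y = numpy.array([next_word], dtype='int64')
    return output
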
# print("Nonzeros in masks", ones) labels = labels.type(torch.LongTensor) outputs = rnn(features) _, predicted = torch.max(outputs, 1) matched = torch.eq(predicted.data, labels) matched = torch.mul(matched.type(torch.FloatTensor), masks) correct += matched.sum() total += masks.sum() # print("correct prediction: {}; total prediction: {}".format(correct, total)) # print("sum comparison:", masks.sum(), seq_len.sum()) accuracy = correct / total return accuracy rnn = BiGRU(input_size, hidden_size, num_layers, num_classes, batch_size) rnn = rnn.cuda() if use_cuda else rnn print("Model loaded!!!!!!!!") criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate) for epoch in range(num_epochs): print("Epoch {} is running".format(epoch + 1)) for i, (name, features, masks, labels, seq_len) in enumerate(train_loader): features = Variable(features) labels = labels.view(-1) labels = Variable(labels.type(torch.LongTensor)) # print("labels size: {}".format(labels.size())) if use_cuda:
def train_model(args, train_text=None, train_labels=None, eval_text=None,
                eval_labels=None, tokenizer=None):
    textattack.shared.utils.set_seed(args.random_seed)
    _make_directories(args.output_dir)
    num_gpus = torch.cuda.device_count()

    # Also write logger output to a file.
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    train_examples_len = len(train_text)
    # label_id_len = len(train_labels)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: {sorted(label_set)}")

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        raise ValueError(
            f"Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    if args.model == "gru":
        textattack.shared.logger.info(
            "Loading textattack model: GRUForClassification")
        model = BiGRU()
        model.to(device)
    elif args.model == "lstm":
        textattack.shared.logger.info(
            "Loading textattack model: LSTMForClassification")
        model = BiLSTM()
        model.to(device)

    # attack_class = attack_from_args(args)
    # We are adversarial training if the user specified an attack along with
    # the training args.
    # adversarial_training = (attack_class is not None) and (not args.check_robustness)

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info("Using torch.nn.DataParallel.")
        logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn" or args.model == "gru":

        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from torch.utils.tensorboard import SummaryWriter
    tb_writer = SummaryWriter(args.output_dir)

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        wandb = textattack.shared.utils.LazyLoader("wandb", globals(), "wandb")
        wandb.init(sync_tensorboard=True)

    # Save original args to file.
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    # if augmenter:
    #     logger.info(f"\tNum original examples = {train_examples_len}")
    #     logger.info(f"\tNum examples after augmentation = {len(train_text)}")
    # else:
    #     logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        if num_gpus > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    # if args.do_regression:
    #     # TODO integrate with textattack `metrics` package
    #     loss_fct = torch.nn.MSELoss()
    # else:
    #     loss_fct = torch.nn.CrossEntropyLoss()
    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=True):
        # if adversarial_training:
        #     if epoch >= args.num_clean_epochs:
        #         if (epoch - args.num_clean_epochs) % args.attack_period == 0:
        #             # only generate a new adversarial training set every args.attack_period epochs
        #             # after the clean epochs
        #             logger.info("Attacking model to generate new training set...")
        #             adv_attack_results = _generate_adversarial_examples(
        #                 model_wrapper, attack_class, list(zip(train_text, train_labels))
        #             )
        #             adv_train_text = [r.perturbed_text() for r in adv_attack_results]
        #             train_dataloader = _make_dataloader(
        #                 tokenizer, adv_train_text, train_labels, args.batch_size
        #             )
        #     else:
        #         logger.info(f"Running clean epoch {epoch+1}/{args.num_clean_epochs}")

        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=0,
                             leave=True)

        # Use these variables to track training accuracy during classification.
        correct_predictions = 0
        total_predictions = 0

        for step, batch in enumerate(prog_bar):
            ids1, ids2, msk1, msk2, labels = batch
            # input_ids, labels = batch
            labels = labels.to(device)
            # if isinstance(input_ids, dict):
            #     ## dataloader collates dict backwards.
            #     ## This is a workaround to get ids in the right shape
            #     ## for HuggingFace models
            #     input_ids = {
            #         k: torch.stack(v).T.to(device) for k, v in input_ids.items()
            #     }
            #     logits = model(**input_ids)[0]
            # else:
            ids1 = ids1.to(device)
            ids2 = ids2.to(device)
            msk1 = msk1.to(device)
            msk2 = msk2.to(device)
            logits = model(ids1, ids2, msk1, msk2)

            # if args.do_regression:
            #     # TODO integrate with textattack `metrics` package
            #     loss = loss_fct(logits.squeeze(), labels.squeeze())
            # else:
            loss = loss_fct(logits, labels)
            pred_labels = logits.argmax(dim=-1)
            correct_predictions += (pred_labels == labels).sum().item()
            total_predictions += len(pred_labels)

            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)
            # Inc step counter.
            global_step += 1

        # Print training accuracy, if we're tracking it.
        if total_predictions > 0:
            train_acc = correct_predictions / total_predictions
            logger.info(f"Train accuracy: {train_acc*100}%")
            tb_writer.add_scalar("epoch_train_score", train_acc, epoch)

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        # if (not adversarial_training) or (epoch >= args.num_clean_epochs):
        if epoch >= args.num_clean_epochs:
            eval_score = _get_eval_score(model, eval_dataloader, False)
            tb_writer.add_scalar("epoch_eval_score", eval_score, epoch)

            if args.checkpoint_every_epoch:
                _save_model_checkpoint(model, args.output_dir, args.global_step)

            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs > 0) and (
                        args.epochs_since_best_eval_score >
                        args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} epochs since validation accuracy last increased"
                    )
                    break

        if args.check_robustness:
            samples_to_attack = list(zip(eval_text, eval_labels))
            samples_to_attack = random.sample(samples_to_attack, 1000)
            adv_attack_results = _generate_adversarial_examples(
                model_wrapper, attack_class, samples_to_attack)
            attack_types = [r.__class__.__name__ for r in adv_attack_results]
            attack_types = collections.Counter(attack_types)

            adv_acc = 1 - (attack_types["SkippedAttackResult"] /
                           len(adv_attack_results))
            total_attacks = (attack_types["SuccessfulAttackResult"] +
                             attack_types["FailedAttackResult"])
            adv_succ_rate = attack_types["SuccessfulAttackResult"] / total_attacks
            after_attack_acc = attack_types["FailedAttackResult"] / len(
                adv_attack_results)

            tb_writer.add_scalar("robustness_test_acc", adv_acc, global_step)
            tb_writer.add_scalar("robustness_total_attacks", total_attacks,
                                 global_step)
            tb_writer.add_scalar("robustness_attack_succ_rate", adv_succ_rate,
                                 global_step)
            tb_writer.add_scalar("robustness_after_attack_acc",
                                 after_attack_acc, global_step)
            logger.info(f"Eval after-attack accuracy: {100*after_attack_acc}%")

    # Read the saved model and report its eval performance.
    logger.info("Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name, args.config_name)

    # End of training: save the tokenizer.
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warn(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}.")

    # Save a little readme with model info.
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    tb_writer.close()
    logger.info(f"Wrote final training args to {args_save_path}.")

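# Minimal sketch of the `args` object train_model() expects (an assumption; the
# values below are placeholders chosen for illustration, and only attributes
# actually read by the function above are listed):
from argparse import Namespace

args = Namespace(
    output_dir="outputs/bigru",
    random_seed=42,
    model="gru",                    # "gru" or "lstm"
    batch_size=32,
    grad_accum_steps=1,
    num_train_epochs=10,
    num_clean_epochs=0,
    learning_rate=2e-5,
    warmup_proportion=0,
    max_length=128,
    tb_writer_step=100,
    checkpoint_steps=-1,            # <= 0 disables step checkpoints
    checkpoint_every_epoch=False,
    early_stopping_epochs=-1,       # <= 0 disables early stopping
    check_robustness=False,
    do_regression=False,
    save_last=True,
    enable_wandb=False,
    weights_name="pytorch_model.bin",
    config_name="config.json",
)
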
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    train, dev, test, vocab = torch.load(open(options.data_file, 'rb'),
                                         pickle_module=dill)

    batched_train, batched_train_mask, _ = utils.tensor.advanced_batchize(
        train, options.batch_size, vocab.stoi["<pad>"])
    batched_dev, batched_dev_mask, _ = utils.tensor.advanced_batchize(
        dev, options.batch_size, vocab.stoi["<pad>"])

    vocab_size = len(vocab)

    rnnlm = BiGRU(vocab_size, use_cuda=use_cuda)
    if use_cuda > 0:
        rnnlm.cuda()
    else:
        rnnlm.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(rnnlm.parameters(),
                                                         options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    rnnlm.train()
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train))):
            train_batch = Variable(batched_train[batch_i])  # of size (seq_len, batch_size)
            train_mask = Variable(batched_train_mask[batch_i])
            train_in_mask = train_mask.view(-1)
            train_out_mask = train_mask.view(-1)
            if use_cuda:
                train_batch = train_batch.cuda()
                train_mask = train_mask.cuda()
                train_in_mask = train_in_mask.cuda()
                train_out_mask = train_out_mask.cuda()

            sys_out_batch = rnnlm(train_batch)  # (seq_len, batch_size, vocab_size)  # TODO: substitute this with your module
            train_in_mask = train_in_mask.unsqueeze(1).expand(
                len(train_in_mask), vocab_size)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            train_out_batch = train_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(train_in_mask).view(
                -1, vocab_size)
            train_out_batch = train_out_batch.masked_select(train_out_mask)
            loss = criterion(sys_out_batch, train_out_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some padding at the end
        dev_loss = 0.0
        rnnlm.eval()
        for batch_i in range(len(batched_dev)):
            dev_batch = Variable(batched_dev[batch_i], volatile=True)
            dev_mask = Variable(batched_dev_mask[batch_i], volatile=True)
            dev_in_mask = dev_mask.view(-1)
            dev_out_batch = dev_batch.view(-1)
            if use_cuda:
                dev_batch = dev_batch.cuda()
                dev_mask = dev_mask.cuda()
                dev_in_mask = dev_in_mask.cuda()
                dev_out_batch = dev_out_batch.cuda()

            sys_out_batch = rnnlm(dev_batch)
            dev_in_mask = dev_in_mask.unsqueeze(1).expand(len(dev_in_mask),
                                                          vocab_size)
            dev_out_mask = dev_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_in_mask).view(
                -1, vocab_size)
            dev_out_batch = dev_out_batch.masked_select(dev_out_mask)
            loss = criterion(sys_out_batch, dev_out_batch)
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".format(
                dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            logging.info(
                "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(
                    options.estop, last_dev_avg_loss.data[0],
                    dev_avg_loss.data[0]))
            break

        torch.save(
            rnnlm,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss

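# Side note (an addition, not part of the original script): NLLLoss averages the
# negative log-likelihood over the selected non-pad tokens, so an approximate
# dev perplexity can be reported from the average dev loss computed above.
import math

def perplexity(avg_nll):
    """Convert an average per-token negative log-likelihood into perplexity."""
    return math.exp(avg_nll)

# e.g. logging.info("Dev perplexity: {0:.2f}".format(perplexity(dev_avg_loss.data[0])))
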
class Translate(object):

    def __init__(self, enc_nhids=1000, dec_nhids=1000, enc_embed=620,
                 dec_embed=620, src_vocab_size=30000, trg_vocab_size=30000,
                 **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids * 2 corresponds to the last dimension of the encoded state.
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids * 2,
                               **kwargs)
        # The output size of the decoder equals lr_in if no n_out is defined.
        self.logistic = LogisticRegression(self.lr_in, trg_vocab_size,
                                           prefix='logistic', **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)

    def apply(self, source, source_mask, target, target_mask, v_part=None,
              v_true=None, **kwargs):
        # sbelow and tbelow are 3-D tensors; sbelow[i][j] (tbelow[i][j]) is the
        # embedding of the i-th word of the j-th sentence in the batch.
        # source and source_mask: shape (src_sent_len, batch_size)
        # target and target_mask: shape (trg_sent_len, batch_size)
        # Their types are all theano.tensor.var.TensorVariable (values are numpy.ndarray).

        # (40, 28, 620) = (src_sent_len, batch_size, srcw_embsz)
        sbelow = self.src_lookup_table.apply(source)
        # The shape differs from source: (trg_sent_len - 1, batch_size, trgw_embsz)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        # (src_sent_len, batch_size, src_nhids*2): bidirectional encoding of the source sentence
        s_rep = self.encoder.apply(sbelow, source_mask)

        # Remove the last word, which is '</S>', of each sentence in the batch;
        # the padding words are also </S> (id 29999).
        # tbelow[:-1] -> shape (trg_sent_len - 1, batch_size, trgw_embsz)
        # target_mask[:-1] -> shape (trg_sent_len - 1, batch_size)
        # hiddens, s, a, ss, als = self.decoder.apply(tbelow[:-1], target_mask[:-1], s_rep, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)
        # hiddens from the decoder: shape (trg_sent_len - 1, batch_size, n_out)
        # (padding words are all 0)
        self.mean_cost, self.mean_abs_log_norm = self.logistic.cost(
            hiddens, target, target_mask, v_part, v_true)

        # cost_matrix: shape (trg_sent_len - 1, batch_size); trg_sent_len is the
        # length for this batch and may differ between batches.
        # cost_matrix.sum(): sum of all elements in cost_matrix
        # target_mask.shape[1]: the number of sentences in a batch
        # So cost_matrix.sum() / target_mask.shape[1] is the average cross
        # entropy per sentence in the batch.

    '''
    y_emb_im1: (trgw_embsz,)
    t_stat_im1: (batch_size, trg_nhids)
    ctx: (src_sent_len, batch_size, src_nhids*2)
    c_x: (src_sent_len, batch_size, trg_nhids)
    '''

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        mask = T.alloc(numpy.float32(1.), sbelow.shape[0], sbelow.shape[1])
        # (src_sent_len, batch_size, src_nhids*2); batch_size == 1 for decoding
        ctx = self.encoder.apply(sbelow, mask)
        # self.decoder.Ws: (src_nhids*2, trg_nhids)
        # self.decoder.bs: (trg_nhids, )
        # (src_sent_len, batch_size, trg_nhids) (-1 ~ 1)
        # As long as ctx is passed in as a parameter, there is no need for c_x;
        # it will be calculated here... do not worry.
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        # init_state: (batch_size, trg_nhids)
        init_state = self.decoder.init_state(ctx)  # no mask here, because no batch
        f_init = theano.function([x], [init_state, ctx, c_x], name='f_init')

        # --------------------------------------------------------------
        y_im1 = T.vector('y_sampler', dtype='int64')
        y_emb_im1 = self.trg_lookup_table.index(y_im1)
        f_emb = theano.function([y_im1], y_emb_im1, name='f_emb')

        # t_yemb_im1 = T.tensor3('t_yemb_im1', dtype='float32')
        t_yemb_im1 = T.matrix('t_yemb_im1', dtype='float32')
        t_stat_im1 = T.matrix('t_stat_im1', dtype='float32')

        # --------------------------------------------------------------
        # Get the next hidden state h1: h_i = rnn(y_{i-1}, s_{i-1})
        # y_emb_im1: embedding of one target word, shape (1, trgw_embsz)
        hi = self.decoder._step_forward(x_t=t_yemb_im1, x_m=None,
                                        h_tm1=t_stat_im1)
        f_nh = theano.function([t_yemb_im1, t_stat_im1], hi, name='f_nh')

        # --------------------------------------------------------------
        t_hi = T.matrix('t_hi', dtype='float32')
        t_ctx = T.tensor3('t_ctx', dtype='float32')
        t_c_x = T.tensor3('t_c_x', dtype='float32')
        # Next attention: a_i = a(h_i, c_i); c_i actually does not change ...
        pi, ai = self.decoder.attention_layer.apply(source_ctx=t_ctx,
                                                    source_mask=None,
                                                    source_x=t_c_x,
                                                    cur_hidden=t_hi)
        f_na = theano.function([t_ctx, t_c_x, t_hi], [pi, ai], name='f_na')

        # --------------------------------------------------------------
        # Get the next final state: s_i = f(h_i <= (y_{i-1} and s_{i-1}), y_{i-1}, c_i)
        t_ai = T.matrix('t_ai', dtype='float32')
        ns = self.decoder.state_with_attend(h1=t_hi, attended=t_ai)
        f_ns = theano.function([t_hi, t_ai], ns, name='f_ns')

        # --------------------------------------------------------------
        # merge_out = g(y_{i-1}, s_i, a_i)
        t_si = T.matrix('t_si', dtype='float32')
        merge_out = self.decoder.merge_out(y_emb_im1=t_yemb_im1, s_i=t_si,
                                           a_i=t_ai)
        f_mo = theano.function([t_yemb_im1, t_ai, t_si], merge_out, name='f_mo')

        # --------------------------------------------------------------
        # Model score over the whole vocabulary: nonlinear(merge_out)
        t_mo = T.matrix('t_mo', dtype='float32')
        if self.use_mv:
            ptv = T.vector('ptv', dtype='int64')
            ptv_ins = [t_mo, ptv]
            ptv_ous = self.logistic.apply_score(t_mo, ptv, drop=True)
        else:
            ptv_ins = [t_mo]
            ptv_ous = self.logistic.apply_score(t_mo, drop=True)
        f_pws = theano.function(ptv_ins, ptv_ous, name='f_pws')

        # --------------------------------------------------------------
        # No need to use the whole vocabulary: vocabulary manipulation.
        # If T.ivector() is used, this slice is very slow on CPU; I do not know why.
        y = T.wscalar('y')
        # Model score slice for a single word: nonlinear(merge_out)[part]
        f_one = theano.function([t_mo, y],
                                self.logistic.apply_score_one(t_mo, y),
                                name='f_one')

        # --------------------------------------------------------------
        # Distribution over the target vocabulary: softmax(energy)
        t_pws = T.matrix('t_pws', dtype='float32')
        # self.logistic.apply_softmax(t_pws)
        # self.logistic.softmax(t_pws)
        f_ce = theano.function([t_pws], T.nnet.softmax(t_pws), name='f_ce')

        # next_w (y_emb_im1): (k-dead_k,), the last word id of each translation candidate in the beam
        # ctx: (src_sent_len, live_k, src_nhids*2)
        # t_stat_im1: shape (k-dead_k, trg_nhids)
        # probs: shape (k-dead_k, trg_vocab_size)
        # f_next .................
        next_probs, next_state = self.next_prob_state(y_emb_im1, t_stat_im1,
                                                      ctx, c_x)
        inps = [y_im1, ctx, t_stat_im1]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return [f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_one, f_ce, f_next,
                f_emb]

    def next_prob_state(self, y_emb_im1, s_im1, ctx, c_x):
        next_state, merge_out = self.decoder.next_state_mout(
            y_emb_im1, s_im1, ctx, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        # Overwrite all weights from file.
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            # type(value) is theano.tensor.sharedvar.TensorSharedVariable and
            # params_value[key] is a numpy.ndarray;
            # we set the shared variable to the numpy array.
            value.set_value(params_value[key])

    '''
    type(params_value['logistic_W0']): numpy.ndarray, shape (512, 30000)
    array([[-0.00096034, -0.0392303 , -0.07458289, ..., -0.00285031,  0.03942127, -0.03161906],
           [-0.03706803, -0.06445373, -0.00836279, ..., -0.01915432, -0.00247126,  0.17407075],
           [-0.00102945,  0.03983303, -0.00801838, ..., -0.02834764,  0.02834882, -0.07769781],
           ...,
           [ 0.01267207,  0.07802714, -0.02748013, ...,  0.0485581 , -0.00657458,  0.07204553],
           [ 0.01089897,  0.06406539, -0.04804269, ..., -0.03247456,  0.04343275, -0.14596273],
           [ 0.01474529,  0.02925147,  0.01569422, ...,  0.01673588, -0.02202134,  0.19972666]],
          dtype=float32)
    '''

    def load2numpy(self, filename):
        # Read all weights from file without assigning them.
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        return params_value

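# Sketch of one greedy decoding loop wired through the factored functions
# returned by build_sample() above (f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_ce,
# f_emb). Illustrative only: `src_ids` is assumed to be a numpy int64 array of
# shape (src_len, 1), `bos_id`/`eos_id`/`max_len` are hypothetical constants,
# and the use_mv == 0 branch of f_pws is assumed.
import numpy

def greedy_decode_factored(translate, src_ids, bos_id, eos_id, max_len=80):
    (f_init, f_nh, f_na, f_ns, f_mo, f_pws,
     f_one, f_ce, f_next, f_emb) = translate.build_sample()
    state, ctx, c_x = f_init(src_ids)        # s_0, encoder context, projected context
    y_prev = numpy.array([bos_id], dtype='int64')
    output = []
    for _ in range(max_len):
        y_emb = f_emb(y_prev)                # embedding of y_{i-1}
        hi = f_nh(y_emb, state)              # h_i = rnn(y_{i-1}, s_{i-1})
        _, ai = f_na(ctx, c_x, hi)           # attention context a_i
        state = f_ns(hi, ai)                 # s_i
        mo = f_mo(y_emb, ai, state)          # merge_out = g(y_{i-1}, s_i, a_i)
        probs = f_ce(f_pws(mo))              # softmax over the target vocabulary
        next_word = int(probs[0].argmax())
        if next_word == eos_id:
            break
        output.append(next_word)
        y_prev = numpy.array([next_word], dtype='int64')
    return output
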
def model_train_validate_test(train_df, dev_df, test_df, embeddings_file,
                              vocab_file, target_dir, mode, num_labels=2,
                              max_length=50, epochs=50, batch_size=128,
                              lr=0.0005, patience=5, max_grad_norm=10.0,
                              gpu_index=0, if_save_model=False,
                              checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model is saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if embeddings_file is not None:
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _ = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiGRU model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _ = validate(model, dev_loader,
                                                             criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if if_save_model:
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))
                print("save model successfully!\n")
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(model, test_loader,
                                                        criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy * 100))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

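# Illustrative call (an assumption; the CSV paths, the "word" mode flag, and the
# hyperparameter values are placeholders, not taken from the original project):
train_df = pd.read_csv("data/train.csv")
dev_df = pd.read_csv("data/dev.csv")
test_df = pd.read_csv("data/test.csv")
model_train_validate_test(train_df, dev_df, test_df,
                          embeddings_file="data/embeddings.pkl",
                          vocab_file="data/vocab.txt",
                          target_dir="models/bigru",
                          mode="word",
                          epochs=20,
                          batch_size=128,
                          lr=0.0005,
                          if_save_model=True)
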