def __init__(self, hparams=DotDict({
        'model_type': 'transformer',
        'ninp': 128,
        'nhead': 2,
        'nhid': 512,
        'nlayers': 2,
        'tie_layers': True,
        'tie_encoder_decoder': True,
        'dropout': 0.1,
})):
    super(LanguageModelTrainer, self).__init__()
    self.hparams = hparams if isinstance(hparams, DotDict) \
        else DotDict(hparams)

    from utils import get_default_tokenizer
    self.vocab_size = get_default_tokenizer()._tokenizer.get_vocab_size()

    self.model_type = hparams.get('model_type', 'transformer')
    assert self.model_type in ['transformer', 'lstm']

    if self.model_type == 'transformer':
        self.model = TransformerModel(ntoken=self.vocab_size, **hparams)
    else:
        self.model = LSTMModel(ntoken=self.vocab_size, **hparams)

    self.batch_size = hparams.get('batch_size', 64)
    self.bptt = hparams.get('bptt', 128)
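# NOTE: `DotDict` is imported from elsewhere in this project and is not shown
# above. A minimal stand-in, consistent with how the snippet uses it
# (dict-style construction, .get(), attribute access, ** unpacking), could look
# like the sketch below. This is an assumption, not the project's actual code.
class DotDict(dict):
    """A dict whose keys can also be read and written as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError as err:
            raise AttributeError(name) from err

    def __setattr__(self, name, value):
        self[name] = value


# Example: attribute access and .get() return the same value.
hparams = DotDict({'model_type': 'transformer', 'ninp': 128})
assert hparams.ninp == hparams.get('ninp') == 128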
def run(stock: str, model_type: str, stationary=True):
    df = Analysis.get_data(stock)
    df["Company stock name"] = stock.split('/')[-1].split('.')[0]

    dataset = GetDataset(df)
    dataset.get_dataset(scale=False, stationary=stationary)
    train_data, test_data, train_data_len = dataset.split(train_split_ratio=0.8, time_period=30)
    train_data, test_data = dataset.get_torchdata()
    x_train, y_train = train_data
    x_test, y_test = test_data

    if model_type == 'lstm':
        params = rnn_params
        model = TorchRNN(rnn_type=params.rnn_type, input_dim=params.input_dim,
                         hidden_dim=params.hidden_dim, output_dim=params.output_dim,
                         num_layers=params.num_layers)
    elif model_type == 'transformer':
        params = transf_params
        model = TransformerModel(params)
    else:
        raise ValueError('Wrong model type selection, select either "lstm" or "transformer"!')

    clf = Classifier(model)
    clf.train([x_train, y_train], params=params)

    y_scaler = dataset.y_scaler
    predictions = clf.predict([x_test, y_test], y_scaler, data_scaled=False)
    predictions = pd.DataFrame(predictions)
    predictions.reset_index(drop=True, inplace=True)
    predictions.index = df.index[-len(x_test):]
    predictions['Actual'] = y_test[:-1]
    predictions.rename(columns={0: 'Predictions'}, inplace=True)

    if stationary:
        predictions = Analysis.inverse_stationary_data(old_df=df, new_df=predictions,
                                                       orig_feature='Actual',
                                                       new_feature='Predictions',
                                                       diff=12, do_orig=False)

    plot_predictions(df, train_data_len, predictions["Predictions"].values, model_type)
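# Hypothetical invocation of run() above; the CSV path is an illustration and
# rnn_params / transf_params are assumed to be defined in the enclosing module.
if __name__ == '__main__':
    # Predict the stock series with the transformer variant on differenced
    # (stationary) prices, then plot predictions against the actual values.
    run('data/AAPL.csv', model_type='transformer', stationary=True)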
def initialize_model(type_model, args):
    if type_model.lower() == 'lstm':
        model = Stacked_LSTM(args)
    elif type_model.lower() == 'attention_lstm':
        model = LSTM_and_Attention(args)
    elif type_model.lower() == 'transformer':
        model = TransformerModel(d_input=3)
    elif type_model.lower() == 'cnn':
        model = CNN(n_out=36, dropout=0.01)
    else:
        raise ValueError(
            "Invalid model type: expected 'lstm', 'attention_lstm', "
            f"'transformer' or 'cnn', got '{type_model}'")

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    return model, optimizer, loss_fn
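# Hypothetical usage of initialize_model(); only `learning_rate` is read by
# this function itself, while any attributes required by Stacked_LSTM /
# LSTM_and_Attention would also live on `args` (assumed here).
from argparse import Namespace

args = Namespace(learning_rate=1e-3)
model, optimizer, loss_fn = initialize_model('transformer', args)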
def create_new_model():
    if args.model_type == "seq2seq":
        return Seq2Seq(word_vectors=word_vectors,
                       hidden_size=args.hidden_size,
                       output_size=vocab_size,
                       device=device)
    elif args.model_type == "seq2seq_attn":
        return Seq2SeqAttn(word_vectors=word_vectors,
                           hidden_size=args.hidden_size,
                           output_size=vocab_size,
                           device=device)
    elif args.model_type == "transformer":
        return TransformerModel(vocab_size, device,
                                num_encoder_layers=2,
                                num_decoder_layers=2,
                                dropout=0.1)
cuda = config['train']['cuda']

# Main Loop
while True:
    min_test_loss = 1.e6
    loss = 0.0
    train_loss_seq = []
    test_loss_seq = []

    if model_type == 'Transformer':
        model = TransformerModel(config)
    elif model_type == 'LSTM':
        model = LSTMModel(config)
    else:
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'Transformer' or 'LSTM')")

    if cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['train']['learning_rate'],
                                 weight_decay=config['train']['weight_decay'])
    criterion = torch.nn.MSELoss()
    optimizer.zero_grad()

    for it in range(n_iter):
        model.train()
        country = random.choice(train_countries)
def launch(model_params, checkpoint_path, device='cuda'): print('model_params:\t', model_params) max_length = model_params['bptt'] tokenizer = get_default_tokenizer() eos_token = tokenizer.token_to_id('[SEP]') eod_token = tokenizer.token_to_id('[DOC_SEP]') vocab_size = tokenizer._tokenizer.get_vocab_size() assert eos_token is not None, 'Invalid tokenizer files - EOS token cannot be null' # Model from models import TransformerModel, LSTMModel model_type = model_params.get('model_type', 'transformer') assert model_type in ['transformer', 'lstm'] if model_type == 'transformer': model = TransformerModel(ntoken=vocab_size, **model_params) else: model = LSTMModel(ntoken=vocab_size, **model_params) model = model.to(device) if checkpoint_path and path.exists(checkpoint_path): print(f'Loading checkpoint from {checkpoint_path}') checkpoint_state = torch.load(checkpoint_path) model.load_state_dict(checkpoint_state) @torch.no_grad() def _generate(input_ids=None, max_length=max_length, do_sample=True, num_beams=5, temperature=1.3, top_k=50, top_p=1.0, repetition_penalty=1.2, eos_token_ids=[eos_token, eod_token], length_penalty=1.0, num_return_sequences=1, vocab_size=vocab_size): pad_token_id = 0 model.eval() batch_size = 1 cur_len = input_ids.shape[1] # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size) ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states past = None # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: outputs = model(input_ids.t()) outputs = outputs.permute(1, 0, 2) # print(input_ids) # print(torch.argmax(outputs)) scores = outputs[:, -1, :] # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: for i in range(batch_size * num_beams): for previous_token in set(input_ids[i].tolist()): # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability if scores[i, previous_token] < 0: scores[i, previous_token] *= repetition_penalty else: scores[i, previous_token] /= repetition_penalty if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering # min_value = torch.min(scores, dim=-1)[] scores = top_k_top_p_filtering( scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search) try: next_words = torch.multinomial( torch.softmax(scores, dim=-1), num_samples=2, replacement=True) # (batch_size * num_beams, 2) except: print((torch.softmax(scores, dim=-1) > 0).sum()) raise ValueError() # Compute next scores _scores = F.log_softmax( scores, dim=-1) # (batch_size * num_beams, vocab_size) _scores = torch.gather( _scores, -1, next_words) # (batch_size * num_beams, 2) next_scores = _scores + beam_scores[:, None].expand_as( _scores) # (batch_size * num_beams, 2) # Match shape of greedy beam search next_words = next_words.view( batch_size, 2 * num_beams) # (batch_size, 2 * 
num_beams) next_scores = next_scores.view( batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) else: # do greedy beam search scores = F.log_softmax( scores, dim=-1) # (batch_size * num_beams, vocab_size) assert scores.size() == (batch_size * num_beams, vocab_size) # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) _scores = scores + beam_scores[:, None].expand_as( scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) _scores = _scores.view( batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) # next batch beam content # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch) next_batch_beam = [] # for each sentence for batch_ex in range(batch_size): # if we are done with this sentence done[batch_ex] = done[batch_ex] or generated_hyps[ batch_ex].is_done(next_scores[batch_ex].max().item()) if done[batch_ex]: next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content next_sent_beam = [] # next words for this sentence for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]): # get beam and word IDs beam_id = idx // vocab_size word_id = idx % vocab_size # end of sentence, or next word if word_id.item( ) in eos_token_ids or cur_len + 1 == max_length: generated_hyps[batch_ex].add( input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()) else: next_sent_beam.append( (score, word_id, batch_ex * num_beams + beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: break # update next beam content assert len(next_sent_beam ) == 0 if cur_len + 1 == max_length else num_beams if len(next_sent_beam) == 0: next_sent_beam = [(0, pad_token_id, 0) ] * num_beams # pad the batch next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_ex + 1) # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) beam_words = input_ids.new([x[1] for x in next_batch_beam]) beam_idx = input_ids.new([x[2] for x in next_batch_beam]) # re-order batch input_ids = input_ids[beam_idx, :] input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1) # re-order internal states if past: reordered_past = [] for layer_past in past: # get the correct batch idx from layer past batch dim # batch dim of `past` and `mems` is at 2nd position reordered_layer_past = [ layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx ] reordered_layer_past = torch.cat(reordered_layer_past, dim=1) # check that shape matches assert reordered_layer_past.shape == layer_past.shape reordered_past.append(reordered_layer_past) past = tuple(reordered_past) # update current length cur_len = cur_len + 1 # stop when we are done with each sentence if all(done): break # visualize hypotheses # print([len(x) for x in generated_hyps], cur_len) # globals().update( locals() ); # !import code; code.interact(local=vars()) # for ii in range(batch_size): # for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True): # print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist())) # print("") # select the best hypotheses 
tgt_len = input_ids.new(batch_size) best = [] for i, hypotheses in enumerate(generated_hyps): if len(hypotheses.hyp) == 0: continue best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1] tgt_len[i] = len(best_hyp) + 1 # +1 for the <EOS> symbol best.append(best_hyp) # generate target batch decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) for i, hypo in enumerate(best): decoded[i, :tgt_len[i] - 1] = hypo decoded[i, tgt_len[i] - 1] = eos_token_ids[0] return decoded model_input = LEADING_TEXT while True: user_prompt = input(' >>> ') if user_prompt == 'exit': exit() else: num_return_sequences = 1 model_input += ' [P0] ' + user_prompt + ' [SEP] [P1] ' input_ids = tokenizer.encode(model_input).ids input_ids = torch.LongTensor(input_ids).unsqueeze(0) input_ids = input_ids.to(device) output = _generate(input_ids=input_ids, max_length=min(max_length, input_ids.size(1) + 40)) if num_return_sequences != 1: output = output.view(batch_size, num_return_sequences, -1) response = tokenizer.decode(output[0].cpu().tolist(), skip_special_tokens=False) eod_token = '[DOC_SEP]' if eod_token in response: response = response[response.index(eod_token):] start_token = '[P1]' sep_token = '[SEP]' if start_token in response: start_idx = response.index(start_token) + len(start_token) + 1 response = response[start_idx:] if sep_token in response: sep_idx = response.index(sep_token) response = response[:sep_idx] model_input += response + f' {sep_token} ' print('Bot: ' + response)
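# `top_k_top_p_filtering` is called in the sampling branch above but is not
# defined in this snippet. A sketch along the lines of the widely used
# reference implementation is shown below; treat it as an assumption about the
# helper's behaviour rather than the project's exact code.
import torch
import torch.nn.functional as F


def top_k_top_p_filtering(logits, top_k=0, top_p=1.0,
                          filter_value=-float('inf'), min_tokens_to_keep=1):
    """Mask logits outside the top-k / nucleus (top-p) set with filter_value."""
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
        # Remove every logit strictly below the k-th largest one.
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(indices_to_remove, filter_value)
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold,
        # shifted right so the first token that crosses it is kept.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., :min_tokens_to_keep] = False
        indices_to_remove = sorted_indices_to_remove.scatter(
            -1, sorted_indices, sorted_indices_to_remove)
        logits = logits.masked_fill(indices_to_remove, filter_value)
    return logits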
class TransformerBot(BaseBot): def __init__(self, train_dataset, test_dataset, *, val_dataset, n_layers=6, n_head=8, d_model=512, d_inner_hid=1024, d_k=64, d_v=64, edrop=0.25, odrop=0.25, hdrop=0.1, propagate=False, steps=15, avg_window=AVERAGING_WINDOW, clip_grad=5, min_length=TRAIN_PERIODS, tf_decay=0.7**(1 / 6), tf_min=0.02, tf_warmup=12000, tf_steps=2000): self.name = "transformer" if propagate: self.name += "_tf" super(TransformerBot, self).__init__(train_dataset, test_dataset, clip_grad=clip_grad, val_dataset=val_dataset, avg_window=avg_window) self.model = TransformerModel(n_max_seq=TRAIN_PERIODS, n_layers=n_layers, n_head=n_head, d_word_vec=d_model, d_model=d_model, d_inner_hid=d_inner_hid, d_k=d_k, d_v=d_v, propagate=propagate, hdrop=hdrop, edrop=edrop, odrop=odrop, min_length=min_length, y_scale_by=1 / self.global_stds[0], steps=steps) self.model.cuda() self.current_tf_ratio = 1 self.best_tf_ratio = 1 self.tf_min = tf_min self.tf_decay = tf_decay self.tf_steps = tf_steps self.tf_warmup = tf_warmup self.logger.info(str(self.model)) if propagate: self.logger.info( "TF min: {:.2f} TF decay: {:.4f} TF steps: {:d} TF warmup: {:d}" .format(tf_min, tf_decay, tf_steps, tf_warmup)) self.tbwriter.add_text("model_structure", str(self.model)) self.tbwriter.add_text( "TF_setting", "TF min: {:.2f} TF decay: {:.4f} TF steps: {:d} TF warmup: {:d}". format(tf_min, tf_decay, tf_steps, tf_warmup)) def get_model_params(self, steps=0, is_train=True): if is_train: if steps < self.tf_warmup: return {"tf_ratio": 1} if (steps - self.tf_warmup) % self.tf_steps == 0: self.current_tf_ratio = max( self.current_tf_ratio * self.tf_decay, self.tf_min) return {"tf_ratio": self.current_tf_ratio} return {"tf_ratio": 0} def reset_params(self): self.current_tf_ratio = 1 self.best_tf_ratio = 1 def additional_logging(self, step): if self.model.propagate: self.logger.info("Current tf_ratio: {:.4f}".format( self.current_tf_ratio)) self.tbwriter.add_scalar("tf_ratio", self.current_tf_ratio, step) def save_state(self): self.best_tf_ratio = self.current_tf_ratio
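# Quick check of the teacher-forcing schedule implemented by
# get_model_params() above: the ratio stays at 1.0 for `tf_warmup` steps and is
# then multiplied by `tf_decay` once every `tf_steps` steps, floored at
# `tf_min`. Standalone re-computation for illustration only (not part of the bot).
def tf_ratio_at(step, tf_warmup=12000, tf_steps=2000,
                tf_decay=0.7 ** (1 / 6), tf_min=0.02):
    if step < tf_warmup:
        return 1.0
    n_decays = (step - tf_warmup) // tf_steps + 1
    return max(tf_decay ** n_decays, tf_min)


# e.g. tf_ratio_at(22000) ~= 0.7: six decay events of 0.7 ** (1/6) each.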
sort_within_batch=False) valid_iterator = BucketIterator(valid, batch_size=batch_size, device=device, sort=False, sort_within_batch=False) # sumeval evaluator evaluator = SumEvaluator(metrics=metrics, stopwords=False, lang="en") # Transformer model model = TransformerModel( len(IN_TEXT.vocab), t_conf["model"]["params"]["d_model"], # emb_size len(OUT_TEXT.vocab), pretrained_vectors=None, nhead=t_conf["model"]["params"]["nhead"], num_encoder_layers=t_conf["model"]["params"]["num_encoder_layer"], num_decoder_layers=t_conf["model"]["params"]["num_decoder_layer"], dim_feedforward=t_conf["model"]["params"]["dim_feedforward"], dropout=t_conf["model"]["params"]["dropout"]).to(device) # Optimizer # General template to make an optimzier instance # e.g.,) # optimizer = optim.SGD(model.parameters(), # lr=0.1, # momentum=0.9, # nesterov=True) optimizer = eval("{}(model.parameters(), **{})".format( t_conf["training"]["optimizer"]["cls"], str(t_conf["training"]["optimizer"]["params"])))
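# The eval() call above builds something like
# "optim.Adam(model.parameters(), **{'lr': 0.001})" straight from config
# strings. An eval()-free equivalent, assuming the "cls" entry names a class
# reachable under torch.optim (e.g. "Adam" or "optim.Adam"), could be:
import torch.optim as optim

opt_name = t_conf["training"]["optimizer"]["cls"].split(".")[-1]
optimizer = getattr(optim, opt_name)(model.parameters(),
                                     **t_conf["training"]["optimizer"]["params"])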
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target


ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary
emsize = 200   # embedding dimension
nhid = 200     # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2      # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


def train():
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
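# The `train_data` consumed above is not built in this snippet; in the PyTorch
# word-language-model tutorial this code follows, it comes from a batchify()
# helper along these lines (TEXT, train_txt and device are assumed to exist in
# the surrounding script):
def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts and trim off the remainder.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches (one sequence per column).
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


batch_size = 20
train_data = batchify(train_txt, batch_size)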
def main(args): # Set up logging and devices startime = datetime.now() args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) time_log = args.log_time if time_log > 0: log.info(f'Start training at: {startime.strftime("%H:%M:%S")}') tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) model_type = args.model # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # check this #useCharEmbeddings = args.model == 'BiDAFplus' # Get embeddings log.info('Loading embeddings...') print(f'{args.word_emb_file}') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) if time_log > 0: log.info(f'Loaded embeddings: {(datetime.now()-startime).seconds}') # load_char_vectors # Get model log.info('Building model...') if model_type == 'BiDAFplus': # model = BiDAFplus(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) elif model_type == 'BiDAFbase': model = BiDAFbase(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif model_type == "Transformer": model = TransformerModel(word_vectors=word_vectors, char_vectors=char_vectors, params=get_params(model_type, args.params)) elif model_type == 'BiDAF': model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) model = nn.DataParallel(model, args.gpu_ids) if time_log > 0: log.info(f'Built model: {(datetime.now()-startime).seconds}') if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) 
# Constant LR # Get data loader log.info('Building dataset...') if args.mode != 'quick_eval': train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) else: loaded_data = quick_eval_data_loader() train_loader = [loaded_data for _ in range(5)] dev_loader = [quick_eval_data_loader(dev=True)] train_dataset = train_loader dev_dataset = dev_loader log.info('Built dataset: {}:{}'.format(*divmod((datetime.now() - startime).seconds, 60))) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) if time_log > 0: traintime = datetime.now() total_iterations = 0 while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') if time_log > 0: epochtime = datetime.now() if args.mode != 'quick_eval': progress_len = len(train_loader.dataset) else: progress_len = len(train_loader) with torch.enable_grad(), \ tqdm(total=progress_len) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: #quick_eval_data_saver(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids) ######### if time_log > 0: itertime = datetime.now() # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() if model_type == 'BiDAF' or model_type == "Transformer": cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs) # Forward elif model_type == 'BiDAFbase': log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() if time_log > 2: forwardtime = datetime.now() log.info('Forward time {}:{}'.format( *divmod((forwardtime - itertime).seconds, 60))) # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) if time_log > 2: backwardtime = datetime.now() log.info('Backward time {}:{}'.format( *divmod((backwardtime - forwardtime).seconds, 60))) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) if time_log > 0: enditertime = datetime.now() #log.info('Iteration {} {}:{}'.format(total_iterations, # *divmod((enditertime-itertime).seconds, 60))) steps_till_eval -= batch_size if steps_till_eval <= 0 or args.mode == 'quick_eval': steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate( model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, model_type, quick_eval=args.mode == 'quick_eval') saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console if time_log > 1: log.info('Eval time {}:{}'.format( *divmod((datetime.now() - enditertime).seconds, 60))) results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): 
tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) total_iterations += 1 if ((time_log == 2) and (total_iterations % 10 == 0)) or ( (time_log == 1) and (total_iterations % 100 == 0)): log.info('Mean iteration time {}:{}'.format( *divmod((enditertime - traintime).seconds / total_iterations, 60))) if time_log > 0: endepochtime = datetime.now() log.info('Epoch time {}:{}'.format( *divmod((endepochtime - epochtime).seconds, 60)))
torch.manual_seed(args.seed)

scan_all = ge.load_scan_file('all', 'train')
scan_all_var = ge.load_scan_var('all', 'train')

input_symbols_scan = get_unique_words([c[0] for c in scan_all])
output_symbols_scan = get_unique_words([c[1] for c in scan_all])

all_symbols_scan = input_symbols_scan + output_symbols_scan
all_lang = Lang(all_symbols_scan)
ntoken = all_lang.n_symbols

# set up transformer encoder-decoder model, loss, optimizer
model = TransformerModel(ntoken=ntoken,
                         emsize=args.emsize,
                         nhead=args.nhead,
                         nhid=args.nhid,
                         nlayers=args.nlayers,
                         dropout=args.dropout)
model = nn.DataParallel(model).cuda()
criterion = nn.NLLLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), args.lr)

if args.model_path:
    if os.path.isfile(args.model_path):
        print('Loading model at:', args.model_path)
        checkpoint = torch.load(args.model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print("=> no checkpoint found at '{}'".format(args.model_path))
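# get_unique_words() is imported from the project's helpers and not shown
# here; an order-preserving version consistent with how it is called above
# (lists of space-separated command/action strings) might look like:
def get_unique_words(sentences):
    words = []
    for sentence in sentences:
        for word in sentence.split():
            if word not in words:
                words.append(word)
    return words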
def main(): args = parse_args() if args.deterministic: random.seed(0) torch.manual_seed(0) np.random.seed(0) torch.backends.cudnn.deterministic = True logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.gpu = 0 TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"), init_token='<sos>', eos_token='<eos>', lower=False) train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits( TEXT, root=args.data_dir) TEXT.build_vocab(train_txt) model = TransformerModel(len(TEXT.vocab.stoi), args.em_size, args.num_heads, args.hid_size, args.num_layers).to(device) # model = torch.nn.DataParallel(model, dim=1) # optimiser = optim.Adam(model.parameters()) # optimiser = Ranger(model.parameters()) optimiser = RAdam(model.parameters()) if args.eval: dataloaders = { "test": DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT), batch_size=args.eval_batch_size, shuffle=False) } if args.resume: resume(model, args) test_loss, test_acc = eval_pll(device, model, dataloaders["test"], args) logger.info(f"Eval: Test Loss = {test_loss}, Test Acc = {test_acc}") else: dataloaders = { "train": DataLoader(TextTrainDataset(train_txt, args.ngram, TEXT, args.poisson_rate), batch_size=args.train_batch_size, shuffle=True), "val": DataLoader(TextEvalDataset(val_txt, args.ngram, TEXT), batch_size=args.eval_batch_size, shuffle=False), "test": DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT), batch_size=args.eval_batch_size, shuffle=False) } args.start_epoch = 0 args.best_acc = 1 / args.ngram if args.resume: resume(model, args, optimiser) # Create folder for the current model and save args model_dir = time.ctime().replace(" ", "_").replace(":", "_") args.model_dir = os.path.join("models", model_dir) os.makedirs(args.model_dir, exist_ok=True) with open(os.path.join(args.model_dir, "args.json"), "w") as f: json.dump(args.__dict__, f, indent=2) args.logger = logger train_pll(device, model, optimiser, dataloaders, args)
device=device, sort=False, sort_within_batch=False) agg_test_iterator = BucketIterator(agg_test, batch_size=batch_size, device=device, sort=False, sort_within_batch=False) # ================================================================ # Load model model = TransformerModel( len(IN_TEXT.vocab), t_conf["model"]["params"]["d_model"], # emb_size len(OUT_TEXT.vocab), pretrained_vectors=None, nhead=t_conf["model"]["params"]["nhead"], num_encoder_layers=t_conf["model"]["params"]["num_encoder_layer"], num_decoder_layers=t_conf["model"]["params"]["num_decoder_layer"], dim_feedforward=t_conf["model"]["params"]["dim_feedforward"], dropout=t_conf["model"]["params"]["dropout"]).to(device) model.load_state_dict(torch.load(model_filepath, map_location=device)) # sumeval evaluator evaluator = SumEvaluator(metrics=t_conf["metrics"], stopwords=False, lang="en") # Old script used t_conf["training"]["gen_maxlen"] gen_maxlen = g_conf["gen_maxtoken"] ## 1. Generation for each entity in "aggregated" test_{}.csv
train_data, val_data, test_data, vocab = dh.get_data()

# Hyper params
n_tokens = len(vocab.stoi)  # the size of vocabulary
emb_size = 512   # embedding size
n_hidden = 200   # dimension of the FF network inside the transformer
n_layers = 2     # number of transformer layers
n_heads = 2      # the number of heads in the multiheadattention models
dropout = 0.2    # dropout percentage
criterion = nn.CrossEntropyLoss()  # loss function
lr = 5.0         # learning rate

# Creating the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = TransformerModel(n_tokens, emb_size, n_heads, n_hidden, n_layers, dropout).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=lr)  # Optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)  # Scheduler for the optimizer


def train(model):
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(dh.bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, dh.bptt)):
        data, targets = dh.get_batch(train_data, i)
        optimizer.zero_grad()
        if data.size(0) != dh.bptt:
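# model.generate_square_subsequent_mask() used in train() above produces the
# usual causal (no-peek-ahead) mask; the standard construction it presumably
# mirrors is:
import torch


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    # Allowed positions get 0.0, future positions get -inf (ignored by softmax).
    return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, 0.0)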
eos_token='<eos>', lower=True) train_data, val_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG)) train_iter, val_iter, test_iter = BucketIterator.splits( (train_data, val_data, test_data), batch_size=args.batch_size) print(len(train_iter)) SRC.build_vocab(train_data, min_freq=2) TRG.build_vocab(train_data, min_freq=2) # Create model model = TransformerModel(len(SRC.vocab), len(TRG.vocab), args.d_model, args.n_head, args.num_enc_layers, args.num_dec_layers, args.dim_feedforword, args.dropout, args.activation).to(device) if args.resume_model is not None: start_epoch, best_wer = resume_model(model, args.resume_model) # Run the model parallelly if torch.cuda.device_count() > 1: logger.info("Using {} GPUs".format(torch.cuda.device_count())) model = nn.DataParallel(model) # Create loss criterion & optimizer criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.learning_rate) # Start training logger.info("Training Started".center(60, '#')) for epoch in range(start_epoch, args.epochs): # Train the model
.with_ascii_quotes_replacement()\ .with_possessive_elimination()\ .with_punct_removal()\ .with_stopwords_removal()\ .with_digit_removal()\ .build() clean_train_df, edited_col_name_train = preprocessor.preprocess(train_df) clean_test_df, edited_col_name_test = preprocessor.preprocess(test_df) # We set our training data and test data training_data = clean_train_df[edited_col_name_train] test_data = clean_test_df[edited_col_name_test] # Create tokenizer, model model = TransformerModel.TransformerModelBuilder().build() print(model) print("Model initialised.") # Prepare the dataset train_X = model.tokenize(training_data.to_list()) train_dataset = Task1Dataset(train_X, train_df['meanGrade']) model.to(device) train_proportion = 0.8 train_examples = round(len(train_dataset) * train_proportion) dev_examples = len(train_dataset) - train_examples train_dataset, dev_dataset = random_split(train_dataset,
class Predictor(PredictorBase): def __init__(self, config): super(Predictor, self).__init__(config) self.model = None self.config = config self.word_to_index, self.label_to_index = self.load_vocab() self.index_to_label = { value: key for key, value in self.label_to_index.items() } self.vocab_size = len(self.word_to_index) self.word_vectors = None self.sequence_length = self.config["sequence_length"] # 创建模型 self.create_model() # 加载计算图 self.load_graph() def load_vocab(self): # 将词汇-索引映射表加载出来 with open(os.path.join(self.output_path, "word_to_index.pkl"), "rb") as f: word_to_index = pickle.load(f) with open(os.path.join(self.output_path, "label_to_index.pkl"), "rb") as f: label_to_index = pickle.load(f) return word_to_index, label_to_index def sentence_to_idx(self, sentence): """ 将分词后的句子转换成idx表示 :param sentence: :return: """ sentence_ids = [ self.word_to_index.get(token, self.word_to_index["<UNK>"]) for token in sentence ] sentence_pad = sentence_ids[: self.sequence_length] if len(sentence_ids) > self.sequence_length \ else sentence_ids + [0] * (self.sequence_length - len(sentence_ids)) return sentence_pad def load_graph(self): """ 加载计算图 :return: """ self.sess = tf.Session() ckpt = tf.train.get_checkpoint_state( os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), self.config["ckpt_model_path"])) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print('Reloading model parameters..') self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) else: raise ValueError('No such file:[{}]'.format( self.config["ckpt_model_path"])) def create_model(self): """ 根据config文件选择对应的模型,并初始化 :return: """ if self.config["model_name"] == "textcnn": self.model = TextCnnModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "bilstm": self.model = BiLstmModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "bilstm_atten": self.model = BiLstmAttenModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "rcnn": self.model = RcnnModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "transformer": self.model = TransformerModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) def predict(self, sentence): """ 给定分词后的句子,预测其分类结果 :param sentence: :return: """ sentence_ids = self.sentence_to_idx(sentence) prediction = self.model.infer(self.sess, [sentence_ids]).tolist()[0] label = self.index_to_label[prediction[0]] return label
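# Hypothetical driver for the Predictor above. The config keys mirror the ones
# the class (and its base class) read; the concrete values and paths are
# assumptions, not part of the original project.
if __name__ == "__main__":
    config = {
        "model_name": "transformer",
        "sequence_length": 200,
        "output_path": "outputs/transformer",      # where word_to_index.pkl lives
        "ckpt_model_path": "ckpt_model/transformer",
    }
    predictor = Predictor(config)
    print(predictor.predict(["this", "movie", "is", "great"]))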
class Trainer(TrainerBase): def __init__(self, args): super(Trainer, self).__init__() self.args = args with open( os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), args.config_path), "r") as fr: self.config = json.load(fr) self.train_data_obj = None self.eval_data_obj = None self.model = None # save_path模型保存目录 self.save_path = os.path.join( os.path.abspath(os.path.dirname(os.getcwd())), self.config["ckpt_model_path"]) if not os.path.exists(self.save_path): os.makedirs(self.save_path) # self.builder = tf.saved_model.builder.SavedModelBuilder("../pb_model/weibo/bilstm/savedModel") # 加载数据集 self.load_data() self.train_inputs, self.train_labels, label_to_idx = self.train_data_obj.gen_data( ) print("train data size: {}".format(len(self.train_labels))) self.vocab_size = self.train_data_obj.vocab_size print("vocab size: {}".format(self.vocab_size)) self.word_vectors = self.train_data_obj.word_vectors self.label_list = [value for key, value in label_to_idx.items()] self.eval_inputs, self.eval_labels = self.eval_data_obj.gen_data() print("eval data size: {}".format(len(self.eval_labels))) print("label numbers: ", len(self.label_list)) # 初始化模型对象 self.create_model() def load_data(self): """ 创建数据对象 :return: """ # 生成训练集对象并生成训练数据 self.train_data_obj = TrainData(self.config) # 生成验证集对象和验证集数据 self.eval_data_obj = EvalData(self.config) def create_model(self): """ 根据config文件选择对应的模型,并初始化 :return: """ if self.config["model_name"] == "textcnn": self.model = TextCnnModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "bilstm": self.model = BiLstmModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "bilstm_atten": self.model = BiLstmAttenModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "rcnn": self.model = RcnnModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) elif self.config["model_name"] == "transformer": self.model = TransformerModel(config=self.config, vocab_size=self.vocab_size, word_vectors=self.word_vectors) def train(self): """ 训练模型 :return: """ gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9, allow_growth=True) sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, gpu_options=gpu_options) with tf.Session(config=sess_config) as sess: # 初始化变量值 sess.run(tf.global_variables_initializer()) current_step = 0 eval_loss_lis = [0] # 创建train和eval的summary路径和写入对象 train_summary_path = os.path.join( os.path.abspath(os.path.dirname(os.getcwd())), self.config["output_path"] + "/summary/train") if not os.path.exists(train_summary_path): os.makedirs(train_summary_path) train_summary_writer = tf.summary.FileWriter( train_summary_path, sess.graph) eval_summary_path = os.path.join( os.path.abspath(os.path.dirname(os.getcwd())), self.config["output_path"] + "/summary/eval") if not os.path.exists(eval_summary_path): os.makedirs(eval_summary_path) eval_summary_writer = tf.summary.FileWriter( eval_summary_path, sess.graph) for epoch in range(self.config["epochs"]): print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"])) for batch in self.train_data_obj.next_batch( self.train_inputs, self.train_labels, self.config["batch_size"]): summary, loss, predictions = self.model.train( sess, batch, self.config["keep_prob"], self.config['learning_rate']) train_summary_writer.add_summary(summary) current_step += 1 if self.config[ 
"num_classes"] == 1 and current_step % self.config[ "print_every"] == 0: acc, auc, recall, prec, f_beta = get_binary_metrics( pred_y=predictions, true_y=batch["y"]) print( "train: step: {}, loss: {}, acc: {}, auc: {}, recall: {}, precision: {}, f_beta: {}" .format(current_step, loss, acc, auc, recall, prec, f_beta)) elif self.config[ "num_classes"] > 1 and current_step % self.config[ "print_every"] == 0: acc, recall, prec, f_beta = get_multi_metrics( pred_y=predictions, true_y=batch["y"], labels=self.label_list) print( "train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}" .format(current_step, loss, acc, recall, prec, f_beta)) #每训练一个epoch输出验证集的评测结果 if self.eval_data_obj: eval_losses = [] eval_accs = [] eval_aucs = [] eval_recalls = [] eval_precs = [] eval_f_betas = [] for eval_batch in self.eval_data_obj.next_batch( self.eval_inputs, self.eval_labels, self.config["batch_size"]): eval_summary, eval_loss, eval_predictions = self.model.eval( sess, eval_batch) eval_summary_writer.add_summary(eval_summary) eval_losses.append(eval_loss) if self.config["num_classes"] == 1: acc, auc, recall, prec, f_beta = get_binary_metrics( pred_y=eval_predictions, true_y=eval_batch["y"]) eval_accs.append(acc) eval_aucs.append(auc) eval_recalls.append(recall) eval_precs.append(prec) eval_f_betas.append(f_beta) elif self.config["num_classes"] > 1: acc, recall, prec, f_beta = get_multi_metrics( pred_y=eval_predictions, true_y=eval_batch["y"], labels=self.label_list) eval_accs.append(acc) eval_recalls.append(recall) eval_precs.append(prec) eval_f_betas.append(f_beta) eval_loss_lis.append(mean(eval_losses)) print("\n") print( "eval: loss: {}, acc: {}, auc: {}, recall: {}, precision: {}, f_beta: {}" .format(mean(eval_losses), mean(eval_accs), mean(eval_aucs), mean(eval_recalls), mean(eval_precs), mean(eval_f_betas))) print("\n") if self.config["ckpt_model_path"] and eval_loss_lis[ -1] >= max(eval_loss_lis): #self.model_save_path是模型保存具体的名字 self.model_save_path = os.path.join( self.save_path, self.config["model_name"]) self.model.saver.save(sess, self.model_save_path, global_step=epoch + 1) elif self.config["ckpt_model_path"] and eval_loss_lis[ -1] < max(eval_loss_lis): if self.config['batch_size'] <= 256: self.config['batch_size'] *= 2 if self.config['learning_rate'] <= 0.00001: self.config['learning_rate'] *= 0.95 print( "epoch: {} lr: {} self.batch_size: {}".format( epoch, self.lr, self.batch_size)) self.save_path = tf.train.latest_checkpoint( self.save_path) print('最新加载的模型路径{}'.format(self.save_path)) else: print('learn_rate 小于0.00001,训练结束')
class LanguageModel(LightningModule): def __init__(self, hparams: dict(), **kwargs) -> 'LightningTemplateModel': # init superclass super().__init__(**kwargs) self.save_hyperparameters() self.hparams = hparams if self.hparams.model == 'awd': self.model = WDLSTM( self.hparams.num_tokens, num_layers=self.hparams.num_layers, num_hidden=self.hparams.num_hidden, num_embedding=self.hparams.num_embedding, tie_weights=self.hparams.tie_weights, embedding_dropout=self.hparams.embedding_dropout, input_dropout=self.hparams.input_dropout, hidden_dropout=self.hparams.hidden_dropout, output_dropout=self.hparams.output_dropout, weight_dropout=self.hparams.weight_dropout) self.model( torch.zeros(self.hparams.bptt, self.hparams.batch_size).long(), self.model.init_hidden(self.hparams.batch_size)) elif self.hparams.model == 'rnn': self.model = RNNModel(self.hparams.rnn_type, self.hparams.num_tokens, num_embedding=self.hparams.num_embedding, num_hidden=self.hparams.num_hidden, num_layers=self.hparams.num_layers, dropout=self.hparams.dropout, tie_weights=self.hparams.tie_weights) elif self.hparams.model == 'transformer': self.model = TransformerModel( self.hparams.num_tokens, num_embedding=self.hparams.num_embedding, num_hidden=self.hparams.num_hidden, num_layers=self.hparams.num_layers, dropout=self.hparams.dropout, num_heads=self.hparams.num_heads) else: raise ValueError(f'Model {self.hparams.model} not recognized.') self.hiddens = None self.criterion = torch.nn.NLLLoss() self.avg_loss = 0 def forward(self, x, hiddens=None): if self.hparams.model != 'transformer': return self.model(x, hiddens) return self.model(x) def on_train_epoch_start(self): self.train_len = len( self.train_dataloader().batch_sampler) * self.hparams.bptt if self.hparams.model != 'transformer': self.hiddens = self.model.init_hidden(self.hparams.batch_size) def training_step(self, batch, batch_idx): x, y = batch if self.hparams.model == 'awd': self.hiddens = repackage_hidden(self.hiddens) out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens) elif self.hparams.model == 'rnn': self.hiddens = repackage_hidden( self.hiddens) if self.hiddens else self.hiddens out, self.hiddens = self(x, self.hiddens) elif self.hparams.model == 'transformer': out = self(x) raw_loss = self.criterion(out, y) loss = raw_loss # The AR and TAR loss are only applied to the output of the final # RNN layer, not to all layers if self.hparams.model == 'awd': # WARNING: It is implementing here \ell_2^2 instead of \ell_2 # Activation Regularization if self.hparams.alpha > 0: loss += self.hparams.alpha * dropped_hs[-1].pow(2).mean() # Temporal Activation Regularization (slowness) if self.hparams.beta > 0: loss += self.hparams.beta * \ (hs[-1][1:] - hs[-1][:-1]).pow(2).mean() ppl = torch.exp(raw_loss) bpc = raw_loss / math.log(2) self.log('train_loss', loss) self.log('train_ppl', ppl, prog_bar=True) self.log('train_bpc', bpc, prog_bar=True) return loss def on_validation_epoch_start(self): self.val_len = len( self.val_dataloader().batch_sampler) * self.hparams.bptt if self.hparams.model != 'transformer': self.hiddens = self.model.init_hidden(self.hparams.batch_size) def validation_step(self, batch, batch_idx): x, y = batch if self.hparams.model == 'awd': self.hiddens = repackage_hidden(self.hiddens) out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens) elif self.hparams.model == 'rnn': self.hiddens = repackage_hidden( self.hiddens) if self.hiddens else self.hiddens out, self.hiddens = self(x, self.hiddens) elif self.hparams.model == 'transformer': out = self(x) loss 
= self.criterion(out, y) self.log('val_loss', len(x) * loss, prog_bar=True, reduce_fx=lambda x: torch.sum(x) / self.val_len) self.log('val_bpc', len(x) * loss, prog_bar=True, reduce_fx=lambda x: (torch.sum(x) / self.val_len) / math.log(2)) self.log('val_ppl', len(x) * loss, prog_bar=True, reduce_fx=lambda x: torch.exp(torch.sum(x) / self.val_len)) return loss def on_test_epoch_start(self): self.test_len = len( self.test_dataloader().batch_sampler) * self.hparams.bptt if self.hparams.model != 'transformer': self.hiddens = self.model.init_hidden(self.hparams.batch_size) def test_step(self, batch, batch_idx): x, y = batch if self.hparams.model == 'awd': self.hiddens = repackage_hidden(self.hiddens) out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens) elif self.hparams.model == 'rnn': self.hiddens = repackage_hidden( self.hiddens) if self.hiddens else self.hiddens out, self.hiddens = self(x, self.hiddens) elif self.hparams.model == 'transformer': out = self(x) loss = self.criterion(out, y) self.log('test_loss', len(x) * loss, prog_bar=True, reduce_fx=lambda x: torch.sum(x) / self.test_len) self.log('test_bpc', len(x) * loss, prog_bar=True, reduce_fx=lambda x: (torch.sum(x) / self.test_len) / math.log(2)) self.log('test_ppl', len(x) * loss, prog_bar=True, reduce_fx=lambda x: torch.exp(torch.sum(x) / self.test_len)) return loss def configure_optimizers(self): """ Return whatever optimizers and learning rate schedulers you want here. At least one optimizer is required. WARNING: The paper use a variation of ASGD, called non-monotonically triggered ASGD (Algorithm 1), which is not implemented yet, They used L to be the number of iterations in an epoch (i.e., after training epoch ends) and n=5. """ if self.hparams.optimizer == 'sgd': optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) if self.hparams.optimizer == 'adam': optimizer = torch.optim.Adam( self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) # scheduler = torch.optim.lr_scheduler.MultiStepLR( # optimizer, self.hparams.multi_step_lr_milestones, gamma=0.1) scheduler = torch.optim.lr_scheduler.OneCycleLR( optimizer, max_lr=self.hparams.learning_rate, epochs=self.hparams.max_epochs, steps_per_epoch=len(self.train_dataloader())) return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}] @staticmethod def add_model_specific_args(parent_parser): parser = ArgumentParser(parents=[parent_parser], add_help=False) parser.add_argument('--num-embedding', type=int, default=400, help='size of word embeddings') parser.add_argument('--num-hidden', type=int, default=1150, help='number of hidden units per layer') parser.add_argument('--num-layers', type=int, default=3, help='number of layers') parser.add_argument('--learning_rate', '--learning-rate', type=float, default=30.0, help='initial learning rate') parser.add_argument('--batch-size', type=int, default=80, metavar='N', help='batch size') parser.add_argument('--bptt', type=int, default=70, help='sequence length') parser.add_argument('--output-dropout', type=float, default=0.4, help='dropout applied to layers (0 = no dropout)') parser.add_argument('--hidden-dropout', type=float, default=0.3, help='dropout for rnn layers (0 = no dropout)') parser.add_argument( '--input-dropout', type=float, default=0.65, help='dropout for input embedding layers (0 = no dropout)') parser.add_argument( '--embedding-dropout', type=float, default=0.1, help='dropout to remove words from embedding layer ' '(0 = 
no dropout)') parser.add_argument( '--weight-dropout', type=float, default=0.5, help='amount of weight dropout to apply to the RNN hidden to ' 'hidden matrix') parser.add_argument( '--alpha', type=float, default=0, help='alpha L2 regularization on RNN activation (alpha = 0 means' ' no regularization)') parser.add_argument( '--beta', type=float, default=0, help='beta slowness regularization applied on RNN activiation ' '(beta = 0 means no regularization)') parser.add_argument('--weight-decay', type=float, default=1.2e-6, help='weight decay applied to all weights') parser.add_argument('--optimizer', type=str, default='sgd', help='optimizer to use (sgd, adam)') parser.add_argument( '--no-tie-weights', dest='tie_weights', default=True, action='store_false', help='if set, does not tie the input/output embedding weights') parser.add_argument('--rnn-type', choices=['LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'], default='LSTM') parser.add_argument('--dropout', type=float, default=0.2) parser.add_argument( '--num-heads', type=int, default=2, help='the number of heads in the encoder/decoder of the ' ' transformer model') return parser
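# Hedged sketch of how this LightningModule is typically wired up from the
# CLI: build the parser via add_model_specific_args(), fill in the
# data-dependent values (vocabulary size, max_epochs for OneCycleLR), and hand
# the resulting dict to the constructor. Names and values below are
# illustrative only, not the original training script.
from argparse import ArgumentParser
import pytorch_lightning as pl

parser = ArgumentParser()
parser.add_argument('--model', default='awd', choices=['awd', 'rnn', 'transformer'])
parser = LanguageModel.add_model_specific_args(parser)
args = parser.parse_args([])  # use the defaults for the sketch

hparams = vars(args)
hparams['num_tokens'] = 33278   # e.g. WikiText-2 vocabulary size, supplied by the data pipeline
hparams['max_epochs'] = 10      # read by configure_optimizers for OneCycleLR

model = LanguageModel(hparams)
trainer = pl.Trainer(max_epochs=hparams['max_epochs'], gradient_clip_val=0.25)
# trainer.fit(model, train_dataloader, val_dataloader)  # dataloaders not shown in the snippet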
def main(args): since = time.time() output_dir = os.path.join(os.getcwd(), 'outputs') os.makedirs(output_dir, exist_ok=True) data_loaders = get_dataloader( input_dir=args.input_dir, which_challenge='3rd_challenge', phases=['test'], max_frame_length=args.max_frame_length, max_vid_label_length=args.max_vid_label_length, max_seg_label_length=args.max_seg_label_length, rgb_feature_size=args.rgb_feature_size, audio_feature_size=args.audio_feature_size, batch_size=args.batch_size, num_workers=args.num_workers) model = TransformerModel( n_layers=args.n_layers, n_heads=args.n_heads, rgb_feature_size=args.rgb_feature_size, audio_feature_size=args.audio_feature_size, d_rgb=args.d_rgb, d_audio=args.d_audio, d_model=args.d_model, d_ff=args.d_ff, d_proj=args.d_proj, n_attns = args.n_attns, num_classes=args.num_classes, dropout=args.dropout) model = model.to(device) checkpoint = torch.load(os.path.join(os.getcwd(), 'models/model-epoch-04.ckpt')) model.load_state_dict(checkpoint['state_dict']) model.eval() df_outputs = {i: pd.DataFrame(columns=['vid_id', 'vid_label_pred', 'vid_prob', 'seg_label_pred', 'seg_prob']) \ for i in range(1, args.num_classes+1)} for idx, (vid_ids, frame_lengths, frame_rgbs, frame_audios, vid_labels, seg_labels, seg_times) \ in enumerate(data_loaders['test']): if idx%10 == 0: print('idx:', idx) # frame_rgbs: [batch_size, frame_length, rgb_feature_size] # frame_audios: [batch_size, frame_length, audio_feature_size] frame_rgbs = frame_rgbs.to(device) frame_audios = frame_audios.to(device) batch_size = frame_audios.size(0) # vid_probs: [batch_size, num_classes] # attn_idc: [batch_size, num_classes] # scores: [batch_size, max_seg_length, n_attns] # attn_weights: [batch_size, max_seg_length, n_attns] vid_probs, attn_idc, scores, attn_weights, conv_loss = model(frame_rgbs, frame_audios, device) # vid_probs: [batch_size, vid_pred_length] # vid_label_preds: [batch_size, vid_pred_length] vid_probs, vid_label_preds = torch.topk(vid_probs, args.vid_pred_length) vid_label_preds = vid_label_preds + 1 # attn_idc: [batch_size, num_classes+1] zeros = torch.zeros(batch_size, 1).long().to(device) attn_idc = torch.cat((zeros, attn_idc), dim=1) # selected_attn_idc: [batch_size, vid_pred_length] selected_attn_idc = torch.gather(attn_idc, 1, vid_label_preds) # attn_weights: [batch_size, n_attns, max_seg_length] attn_weights = attn_weights.transpose(1, 2) # selected_attn_weights: [batch_size, vid_pred_length, max_seg_length] selected_attn_weights = batched_index_select(attn_weights, 1, selected_attn_idc) # seg_probs: [batch_size, vid_pred_length, seg_pred_length] # seg_label_preds: [batch_size, vid_pred_length, seg_pred_length] seg_probs, seg_label_preds = torch.topk(selected_attn_weights, args.seg_pred_length) seg_label_preds = seg_label_preds + 1 # seg_prob_min, seg_prob_max: [batch_size, vid_pred_length] seg_prob_min, _ = seg_probs.min(dim=2) seg_prob_max, _ = seg_probs.max(dim=2) # seg_prob_min, seg_prob_max: [batch_size, vid_pred_length, seg_pred_length] seg_prob_min = seg_prob_min.unsqueeze(2).expand(batch_size, args.vid_pred_length, args.seg_pred_length) seg_prob_max = seg_prob_max.unsqueeze(2).expand(batch_size, args.vid_pred_length, args.seg_pred_length) # seg_probs: [batch_size, vid_pred_length, seg_pred_length] seg_probs = (seg_probs - seg_prob_min) / (seg_prob_max - seg_prob_min + 1e-6) # To save predictions, converted to numpy data. 
vid_probs = vid_probs.cpu().detach().numpy() vid_label_preds = vid_label_preds.cpu().numpy() seg_probs = seg_probs.cpu().detach().numpy() seg_label_preds = seg_label_preds.cpu().numpy() for i in range(batch_size): for j in range(args.vid_pred_length): vid_label_pred = vid_label_preds[i][j] df_outputs[vid_label_pred] = df_outputs[vid_label_pred].append( {'vid_id': vid_ids[i], 'vid_label_pred': vid_label_pred, 'vid_prob': vid_probs[i][j], 'seg_label_pred': list(seg_label_preds[i][j]), 'seg_prob': list(seg_probs[i][j])}, ignore_index=True) for i in range(1, args.num_classes+1): df_outputs[i].to_csv(os.path.join(output_dir, '%04d.csv'%i), index=False) time_elapsed = time.time() - since print('=> Running time in a epoch: {:.0f}h {:.0f}m {:.0f}s' .format(time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60))
def generate_summary(model, tokenizer, document, decoder): """ Generates a summary for a single document Parameters ---------- model: ``BartForConditionalGeneration`` A BART model that has been fine-tuned for summarization tokenizer: ``BartForConditionalGeneration``: A corresponding BART tokenizer document: ``str`` A single document to be summarized decoder: ``str`` The decoder to use for decoding Returns: ---------- summary: ``str`` A generated summary of the input document summary_score: ``float`` The log-probability score of the summary """ input_ids = tokenizer(document, truncation=True, return_tensors='pt')['input_ids'] metadata = {'input_ids': input_ids} model_wrapper = TransformerModel(model) if decoder == 'greedy': top_candidate = decoders.greedy_decoding( model=model_wrapper, max_length=50, eos_id=tokenizer.eos_token_id, decoded_ids=[tokenizer.bos_token_id], metadata=metadata) elif decoder == 'beam_search': top_candidate = decoders.beam_search_decoding( model=model_wrapper, beam_size=3, max_length=50, eos_id=tokenizer.eos_token_id, decoded_ids=[tokenizer.bos_token_id], metadata=metadata)[0] elif decoder == 'random': # Random sampling top_candidate = decoders.top_k_sampling( model=model_wrapper, top_k=int(1e9), # random sampling is top-K with large K temperature=1, max_length=50, eos_id=tokenizer.eos_token_id, decoded_ids=[tokenizer.bos_token_id], metadata=metadata) elif decoder == 'top_k': top_candidate = decoders.top_k_sampling( model=model_wrapper, top_k=3, temperature=0.5, max_length=50, eos_id=tokenizer.eos_token_id, decoded_ids=[tokenizer.bos_token_id], metadata=metadata) elif decoder == 'nucleus': top_candidate = decoders.nucleus_sampling( model=model_wrapper, top_p=0.2, max_length=50, eos_id=tokenizer.eos_token_id, decoded_ids=[tokenizer.bos_token_id], metadata=metadata) summary_ids = top_candidate.decoded_ids summary = tokenizer.decode(summary_ids, skip_special_tokens=True) summary_score = top_candidate.score return summary, summary_score
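# Example call for generate_summary() above. The checkpoint name is only an
# illustration (any BART summarization checkpoint would do), and `decoders` /
# `TransformerModel` are the project's own wrappers, assumed to be importable.
from transformers import BartForConditionalGeneration, BartTokenizer

bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

document = ("The city council met on Tuesday to debate the new transit plan, "
            "which would add two light-rail lines and extend bus service.")
summary, score = generate_summary(bart, bart_tokenizer, document, decoder='beam_search')
print(summary, score)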
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) model_type = args.model # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # char_vectors = load_char_vectors # Get model log.info('Building model...') if model_type == 'BiDAFplus': # model = BiDAFplus(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) elif model_type == 'BiDAFbase': model = BiDAFbase(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif model_type == "Transformer": model = TransformerModel(word_vectors=word_vectors, char_vectors=char_vectors, input_size=len(word_vectors), hidden_size=args.hidden_size) elif model_type == 'BiDAF': model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) if model_type == 'BiDAF' or model_type == 'BiDAFplus': cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs) # Forward else: log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) 
log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])