def extract_sentiment_words():
    # create vocabulary using wikitext2
    train_txt, _, _ = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)

    start = time.time()
    x_train, y_train, x_val, y_val, rtrain, rtest = preprocess()
    end = time.time()
    print("PREPROCESSING TIME: {}".format(end - start))

    ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary

    # FIXME set up batched examples for better generality
    # batch_size = 20
    # eval_batch_size = 10

    # configs
    emsize = 200   # embedding dimension
    nhid = 200     # feedforward dimension
    nlayers = 2    # n encoders
    nhead = 2      # multiattention heads
    dropout = 0.2  # the dropout value

    # initialize main torch vars
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    lr = 0.05  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    epochs = 50
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_model(x_train, y_train, model, criterion, optimizer, scheduler, epoch)
        val_loss = evaluate(x_val, y_val, rtest, model, criterion)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()

    # test_loss = evaluate(best_model, criterion, test_data)
    # print('=' * 89)
    # print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    #     test_loss, math.exp(test_loss)))
    # print('=' * 89)

    return best_model
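Several of the examples in this collection construct TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout) without showing the class itself. As a point of reference only, a minimal encoder-only language model in the spirit of the PyTorch word-language-model example is sketched below; the class body is an assumption inferred from the constructor arguments and call sites, not the code actually used by any of these snippets.

import math

import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model)
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Encoder-only language model: embedding -> positional encoding ->
    nn.TransformerEncoder -> linear projection back onto the vocabulary."""

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.ninp = ninp
        self.embed = nn.Embedding(ntoken, ninp)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layer = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, nlayers)
        self.decoder = nn.Linear(ninp, ntoken)

    def generate_square_subsequent_mask(self, sz):
        # Upper-triangular -inf mask so position i cannot attend to positions > i.
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, src, has_mask=True):
        # src: (seq_len, batch) of token indices
        mask = None
        if has_mask:
            mask = self.generate_square_subsequent_mask(src.size(0)).to(src.device)
        x = self.embed(src) * math.sqrt(self.ninp)
        x = self.pos_encoder(x)
        output = self.transformer_encoder(x, mask)
        # Some of the snippets below assume log-probabilities instead of raw
        # logits (e.g. when paired with NLLLoss); those variants would end with
        # F.log_softmax(self.decoder(output), dim=-1).
        return self.decoder(output)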
def set_transformer_model(self):
    '''
    This function loads the base transformer model.

    Args:
        transformer_config_path: config path (yaml) of the transformer
        transformer_weights_path: optional; if given, the weights are loaded as well
    Returns:
        None
    '''
    # load base transformer model from config
    with open(self.args.transformer_config_path, 'r') as file:
        config = yaml.load(file, yaml.FullLoader)

    model_config = TransformerConfig(config)
    input_dim = config['transformer']['input_dim']
    dr = model_config.downsample_rate
    hidden_size = model_config.hidden_size
    output_attention = False

    base_transformer_model = TransformerModel(
        model_config, input_dim, output_attentions=output_attention).to('cpu')

    # load weights
    if self.args.transformer_weights_path:
        ckpt = torch.load(self.args.transformer_weights_path, map_location='cpu')
        base_transformer_model.load_state_dict(ckpt['Transformer'])

    self.base_transformer_model = base_transformer_model
def build_model(self):
    self.model = TransformerModel(self.opt, self.dict)

    # todo
    if self.opt['embedding_type'] != 'random':
        pass

    if self.opt['load_dict'] is not None:
        logger.info('[ Loading existing model params from {} ]'
                    ''.format(self.opt['load_dict']))
        self.model.load_model(self.opt['load_dict'])

    if self.use_cuda:
        self.model.to(self.device)
def evaluate(sentence):
    sentence = preprocess_sentence(sentence)
    sentence = tf.expand_dims(
        START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)

    output = tf.expand_dims(START_TOKEN, 0)

    test_Transformer = TransformerModel(max_length=MAX_LENGTH,
                                        vocab_size=VOCAB_SIZE,
                                        embedding_matrix=emb_matrix)
    test_model = test_Transformer.model
    test_model.load_weights(checkpoint_path)

    for i in range(MAX_LENGTH):
        # run the freshly loaded model on the current (input, partial output) pair
        predictions = test_model(inputs=[sentence, output], training=False)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, END_TOKEN[0]):
            break

        # concatenate the predicted_id to the output, which is given to the decoder
        # as its input
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)
def inference():
    eval_model = TransformerModel.load_from_checkpoint(
        './lightning_logs/version_0/checkpoints/epoch=8-step=539.ckpt',
        d_model=250, n_heads=10, n_layers=1)
    eval_model.freeze()

    n_steps = 1000
    test_data = pd.read_csv('./data/toy_data/test.csv').to_numpy()
    train_data = pd.read_csv('./data/toy_data/train.csv').to_numpy()

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(train_data)
    test_data = torch.tensor(scaler.transform(test_data).reshape(-1)).float()

    with torch.no_grad():
        for i in range(0, n_steps):
            # data = torch.cat((test_data[-99:], torch.tensor([0]).float()))
            data = test_data[-100:]
            output = eval_model(data.reshape(-1, 1).unsqueeze(-1))
            output = torch.flatten(output)
            test_data = torch.cat((test_data, output[-1:]))

    test_data = test_data.cpu().view(-1)

    # This plot visualizes whether the model picks up any long-term structure in the data.
    plt.plot(test_data[600:], color="red")
    plt.plot(test_data[600:1000], color="blue")
    plt.grid(True, which='both')
    plt.axhline(y=0, color='k')
    plt.show()
def main(_):
    vocab_path = FLAGS.vocab_path
    model_dir = FLAGS.model_dir

    encoder_stack_size = FLAGS.encoder_stack_size
    decoder_stack_size = FLAGS.decoder_stack_size
    hidden_size = FLAGS.hidden_size
    num_heads = FLAGS.num_heads
    filter_size = FLAGS.filter_size
    dropout_rate = FLAGS.dropout_rate

    extra_decode_length = FLAGS.extra_decode_length
    beam_width = FLAGS.beam_width
    alpha = FLAGS.alpha
    decode_batch_size = FLAGS.decode_batch_size
    src_max_length = FLAGS.src_max_length

    source_text_filename = FLAGS.source_text_filename
    target_text_filename = FLAGS.target_text_filename
    translation_output_filename = FLAGS.translation_output_filename

    # transformer model
    subtokenizer = tokenization.restore_subtokenizer_from_vocab_files(vocab_path)
    vocab_size = subtokenizer.vocab_size
    model = TransformerModel(vocab_size=vocab_size,
                             encoder_stack_size=encoder_stack_size,
                             decoder_stack_size=decoder_stack_size,
                             hidden_size=hidden_size,
                             num_heads=num_heads,
                             filter_size=filter_size,
                             dropout_rate=dropout_rate,
                             extra_decode_length=extra_decode_length,
                             beam_width=beam_width,
                             alpha=alpha)

    ckpt = tf.train.Checkpoint(model=model)
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt is None:
        raise ValueError('No checkpoint is found in %s' % model_dir)
    print('Loaded latest checkpoint ', latest_ckpt)
    ckpt.restore(latest_ckpt).expect_partial()

    # build evaluator
    evaluator = SequenceTransducerEvaluator(
        model, subtokenizer, decode_batch_size, src_max_length)

    # translates input sequences, and optionally evaluates BLEU score if
    # groundtruth target sequences are provided
    if target_text_filename is not None:
        case_insensitive_score, case_sensitive_score = evaluator.evaluate(
            source_text_filename, target_text_filename, translation_output_filename)
        print('BLEU(case insensitive): %f' % case_insensitive_score)
        print('BLEU(case sensitive): %f' % case_sensitive_score)
    else:
        evaluator.translate(source_text_filename, translation_output_filename)
        print('Inference mode: no groundtruth translations.\nTranslations written '
              'to file "%s"' % translation_output_filename)
def main(_):
    data_dir = FLAGS.data_dir
    vocab_path = FLAGS.vocab_path
    model_dir = FLAGS.model_dir

    encoder_stack_size = FLAGS.encoder_stack_size
    decoder_stack_size = FLAGS.decoder_stack_size
    hidden_size = FLAGS.hidden_size
    num_heads = FLAGS.num_heads
    filter_size = FLAGS.filter_size
    dropout_rate = FLAGS.dropout_rate

    max_num_tokens = FLAGS.max_num_tokens
    max_length = FLAGS.max_length
    num_parallel_calls = FLAGS.num_parallel_calls

    learning_rate = FLAGS.learning_rate
    learning_rate_warmup_steps = FLAGS.learning_rate_warmup_steps
    optimizer_adam_beta1 = FLAGS.optimizer_adam_beta1
    optimizer_adam_beta2 = FLAGS.optimizer_adam_beta2
    optimizer_adam_epsilon = FLAGS.optimizer_adam_epsilon

    label_smoothing = FLAGS.label_smoothing
    num_steps = FLAGS.num_steps
    save_ckpt_per_steps = FLAGS.save_ckpt_per_steps

    # transformer model
    subtokenizer = tokenization.restore_subtokenizer_from_vocab_files(vocab_path)
    vocab_size = subtokenizer.vocab_size
    model = TransformerModel(vocab_size=vocab_size,
                             encoder_stack_size=encoder_stack_size,
                             decoder_stack_size=decoder_stack_size,
                             hidden_size=hidden_size,
                             num_heads=num_heads,
                             filter_size=filter_size,
                             dropout_rate=dropout_rate)

    # training dataset
    builder = dataset.DynamicBatchDatasetBuilder(
        max_num_tokens, True, max_length, num_parallel_calls)
    filenames = sorted(glob.glob(os.path.join(data_dir, SUFFIX)))
    train_ds = builder.build_dataset(filenames)

    # learning rate and optimizer
    optimizer = tf.keras.optimizers.Adam(
        utils.LearningRateSchedule(
            learning_rate, hidden_size, learning_rate_warmup_steps),
        optimizer_adam_beta1,
        optimizer_adam_beta2,
        epsilon=optimizer_adam_epsilon)

    # checkpoint
    ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)

    # build trainer and start training
    trainer = SequenceTransducerTrainer(model, label_smoothing)
    trainer.train(train_ds, optimizer, ckpt, model_dir, num_steps, save_ckpt_per_steps)
def train(opt, train_data, eval_data=None):
    logger.info("start training task")

    dim_input = 6
    dim_emb = 64
    num_class = train_data.num_class
    transformer_nhead = 2
    transformer_nlayers = 1
    model = TransformerModel(dim_input, dim_emb, transformer_nhead,
                             num_class, transformer_nlayers)
    if model.cuda:
        model = move_to_gpu(model)
    summary(model, train_data[0]['x'].shape)

    try:
        dataloader = DataLoader(
            train_data,
            batch_size=opt.batch_size,
            shuffle=False,
            num_workers=4
        )
        logger.info("create training dataloader")
    except Exception as e:
        logger.error("fail to create dataloader", e)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer=model.optimizer, milestones=[5, 10], gamma=0.1)

    model_path = os.path.join(opt.model_dir, opt.model_name + ".pth")
    global_steps = 0
    best = 0
    for epoch in tqdm(list(range(opt.epoch)), desc='epoch'):
        for step, batch in enumerate(dataloader):
            global_steps += 1
            metrics = model.train(batch)
            if global_steps % opt.log_steps == 0:
                logger.debug(f"global steps={global_steps},{metrics}")
            if global_steps % opt.save_steps == 0:
                val_metrics, eval_result = eval(opt, model, eval_data)
                logger.info(f"global steps={global_steps}, current={val_metrics}, "
                            f"best={best}, result={eval_result}")
                if val_metrics > best:
                    best = val_metrics
                    torch.save(model.state_dict(), model_path)
                    logger.info(f"global steps={global_steps}, save model:{model_path}")
        lr_scheduler.step()
def __init__(self, context: PyTorchTrialContext):
    self.context = context
    data_config = self.context.get_data_config()
    hparams = self.context.get_hparams()

    using_bind_mount = data_config["use_bind_mount"]
    use_cache = data_config["use_cache"]
    self.eval_batch_size = hparams["eval_batch_size"]

    download_directory = (
        Path(data_config["bind_mount_path"]) if using_bind_mount else Path("/data")
    ) / f"data-rank{self.context.distributed.get_rank()}"
    self.corpus = data.load_and_cache_dataset(download_directory, use_cache)

    self.model_cls = hparams["model_cls"]
    emsize = hparams["word_embeddings_size"]
    num_hidden = hparams["num_hidden"]
    num_layers = hparams["num_layers"]
    dropout = hparams["dropout"]
    self.bptt = hparams["bptt"]

    if self.model_cls.lower() == "transformer":
        num_heads = hparams["num_heads"]
        self.model = TransformerModel(
            self.corpus.ntokens, emsize, num_heads, num_hidden, num_layers, dropout)
    else:
        tied = hparams["tied"]
        self.model = RNNModel(
            self.model_cls,
            self.corpus.ntokens,
            emsize,
            num_hidden,
            num_layers,
            dropout,
            tied,
        )
    self.model = self.context.wrap_model(self.model)

    self.criterion = nn.NLLLoss()
    lr = hparams["lr"]
    optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
    self.optimizer = self.context.wrap_optimizer(optimizer)
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            factor=0.25,
            patience=0,
            threshold=0.001,
            threshold_mode="abs",
            verbose=True,
        ),
        LRScheduler.StepMode.MANUAL_STEP,
    )
def __init__(self, mask, hps):
    super(Seq2Seq, self).__init__()
    self.hps = hps
    self.vocab_size = hps.vocab_size
    self.emb_dim = hps.emb_dim
    self.max_len = hps.max_len
    self.batch_size = hps.batch_size
    self.test_batch_size = hps.test_batch_size
    self.mask = mask

    args = DEFAULT_CONFIG
    shared_args = DEFAULT_SHARED_CONFIG

    self.irony_encoder = TransformerModel(
        args, self.vocab_size + self.max_len, self.max_len)
    self.non_encoder = TransformerModel(
        args, self.vocab_size + self.max_len, self.max_len)

    self.shared_encoder = SharedTransformerModel(
        shared_args, self.vocab_size + self.max_len, self.max_len)
    self.shared_decoder = SharedTransformerModel(
        shared_args, self.vocab_size + self.max_len, self.max_len)

    self.irony_decoder = TransformerDecoder(
        args, self.vocab_size + self.max_len, self.max_len, True)
    self.non_decoder = TransformerDecoder(
        args, self.vocab_size + self.max_len, self.max_len, True)
def main(args):
    random_seed(args.seed)
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    train_data = batchify(corpus.train, args.batch_size)
    val_data = batchify(corpus.valid, args.batch_size)
    test_data = batchify(corpus.test, args.batch_size)
    print('loaded data')
    print(f'number of unique tokens: {len(corpus.dictionary)}')

    ntokens = len(corpus.dictionary)
    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens, args.emsize, args.nhead, args.nhid,
            args.nlayers, args.dropout).to(device)
    else:
        model = RNNModel(
            args.model, ntokens, args.emsize, args.nhid,
            args.nlayers, args.dropout, args.tied).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=0.001,
        steps_per_epoch=len(list(range(0, train_data.size(0) - 1, args.bptt))),
        epochs=args.epochs,
        anneal_strategy='linear')
    print('initialized model and optimizer')

    train(args, model, optimizer, train_data, val_data, scheduler)
def main():
    parser = argparse.ArgumentParser(description="Train GPT2 Model")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="Specify batch size")
    parser.add_argument("--num_epoch", type=int, default=3,
                        help="Specify number of epochs")
    parser.add_argument("--learning_rate", type=float, default=5e-5,
                        help="Specify AdamW learning rate")
    args = parser.parse_args()

    setup = models.trav_trans.dataset.Setup(
        "output", "output/train_dps.txt", "output/train_ids.txt")

    layers = [1, 3, 6, 9]
    for l in layers:
        model = TransformerModel(
            len(setup.vocab.idx2vocab),
            CrossEntropyLoss(ignore_index=setup.vocab.pad_idx),
            l,
            300,
            1000,
            6,
            1e-05)
        training_args = TrainingArgs(
            batch_size=args.batch_size,
            num_epoch=args.num_epoch,
            output_dir="output",
            optimizer=AdamW(model.parameters(), lr=args.learning_rate),
            save_model_on_epoch=False,
            suffix=f"{l}-layers")
        trainer = Trainer(model, setup, training_args)
        trainer.train()
def main():
    args = get_args()
    args.n_gpu = 1
    set_seed(args)

    # Construct tokenizer
    tokenizer = CharTokenizer([])
    tokenizer.load(args.load_vocab)
    args.vocab_size = len(tokenizer)
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    # GPU setting
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Construct model
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    # Load data
    noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
    clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
    sents_annotation = ['None'] * len(noisy_sents)

    pairs = [{
        "noisy": noisy,
        "clean": clean,
        "annotation": annot
    } for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

    # Train-validation split
    train_data, valid_data = train_test_split(
        pairs, test_size=args.val_ratio, random_state=args.seed)  # test: about 1000
    logger.info(f"# of train data: {len(train_data)}")
    logger.info(f"# of valid data: {len(valid_data)}")

    train(model, tokenizer, train_data, valid_data, args, eos=args.eos_setting)
def train():
    # data module
    dm = TSDataModule("", seq_len=100, batch_size=32)
    dm.setup()

    # model
    model = TransformerModel(250, 10, 1)

    # trainer
    trainer = pl.Trainer(gradient_clip_val=0.7)
    trainer.fit(model=model, datamodule=dm)

    # prediction
    pass
def evaluate(sentence):
    sentence = preprocess_sentence(sentence)

    vocab_filename = "vocab_" + language + ".txt"
    tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(vocab_filename)

    # Vocabulary size plus start and end token
    VOCAB_SIZE = tokenizer.vocab_size + 2

    # Define start and end token to indicate the start and end of a sentence
    START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

    emb_matrix = load_embeddings(vocab_size=VOCAB_SIZE,
                                 tokenizer=tokenizer,
                                 language=language)
    Transformer = TransformerModel(max_length=MAX_LENGTH,
                                   vocab_size=VOCAB_SIZE,
                                   embedding_matrix=emb_matrix)

    sentence = tf.expand_dims(
        START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)
    output = tf.expand_dims(START_TOKEN, 0)

    # Create a new basic model instance
    model = Transformer.model
    checkpoint_path = loadCheckpoint_chat(VOCAB_SIZE)
    try:
        model.load_weights(checkpoint_path)
        print("Model loaded from checkpoint " + checkpoint_path)
    except ValueError as e:
        print("Error loading checkpoint " + checkpoint_path)
        print("ValueError: " + str(e))

    for i in range(MAX_LENGTH):
        predictions = model(inputs=[sentence, output], training=False)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, END_TOKEN[0]):
            break

        # concatenate the predicted_id to the output, which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), tokenizer
def main():
    parser = argparse.ArgumentParser(description="Train GPT2 Model")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="Specify batch size")
    parser.add_argument("--num_epoch", type=int, default=3,
                        help="Specify number of epochs")
    parser.add_argument("--learning_rate", type=float, default=5e-5,
                        help="Specify AdamW learning rate")
    args = parser.parse_args()

    tokenizer = Tokenizer.from_file("output/tokenizer.json")
    dataset = Dataset("output/train_rq4_dps.txt")
    model = TransformerModel(
        tokenizer.get_vocab_size(),
        CrossEntropyLoss(ignore_index=tokenizer.encode("[PAD]").ids[0]),
        6,
        300,
        1000,
        6,
        1e-05
    )
    training_args = TrainingArgs(
        batch_size=args.batch_size,
        num_epoch=args.num_epoch,
        output_dir="output",
        optimizer=AdamW(model.parameters(), lr=args.learning_rate),
        save_model_on_epoch=False
    )
    trainer = Trainer(
        model,
        dataset,
        tokenizer,
        training_args
    )
    trainer.train()
def main():
    voc_size = args.vocab_sz

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    print("Done")

    print("Inference:")
    num = args.input_start
    if num % 2 != 0:
        print("The input number must be an even number.")
        return
    if num > args.vocab_sz - MAX_LEN * 2:
        print("The input sequence may be out of range.")
        return

    input_nums = [num + i * 2 for i in range(MAX_LEN)]
    src = to_cuda(flow.tensor(input_nums)).unsqueeze(1)
    pred = [0]
    for i in range(MAX_LEN):
        inp = to_cuda(flow.tensor(pred)).unsqueeze(1)
        output = model(src, inp)
        out_num = output.argmax(2)[-1].numpy()[0]
        pred.append(out_num)
    print("input:", input_nums)
    print("pred:", pred)
def main():
    model = TransformerModel(ntoken=100, ninp=8000, nhead=8,
                             nhid=10000, nlayers=1).to('cuda')
    time_steps = 64
    batch_size = 128
    input = torch.zeros(time_steps, batch_size, dtype=torch.int64, device='cuda')

    with measure():
        output = model(input)

    print(output[0, 0])
def main():
    model = TransformerModel(ntoken=100, ninp=8000, nhead=8,
                             nhid=10000, nlayers=1).to('cuda')
    time_steps = 64
    batch_size = 128
    input = torch.zeros(time_steps, batch_size, dtype=torch.int64, device='cuda')

    # warm-up pass before timing
    output = model(input)
    torch.cuda.synchronize()

    with measure():
        for i in range(4):
            output = model(input)
        torch.cuda.synchronize()
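The two benchmarking snippets above rely on a measure() context manager that is not shown. A minimal sketch of such a timer (name and behaviour assumed from the call sites, not taken from the original code) could look like this:

import time
from contextlib import contextmanager

import torch


@contextmanager
def measure(label="elapsed"):
    # Synchronize so pending GPU work does not leak into or out of the timed region.
    torch.cuda.synchronize()
    start = time.time()
    yield
    torch.cuda.synchronize()
    print(f"{label}: {time.time() - start:.4f} s")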
def main():
    print("Generating data...", end="")
    voc_size = args.vocab_sz
    inp = np.arange(2, voc_size, 2)
    tgt = np.arange(3, voc_size, 2)
    data_x, data_y = get_numbers(inp, tgt)
    train_len = int(len(data_x) * 0.9)
    train_x, val_x = data_x[:train_len], data_x[train_len:]
    train_y, val_y = data_y[:train_len], data_y[train_len:]
    print("Done")

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    criterion = to_cuda(nn.CrossEntropyLoss())
    optimizer = flow.optim.Adam(model.parameters(), lr=args.lr)
    print("Done")

    print("Training...")
    min_loss = 100
    for i in range(1, args.n_epochs + 1):
        epoch_loss = train(model, criterion, optimizer, train_x, train_y)
        epoch_loss_val = validation(model, criterion, val_x, val_y)
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss < min_loss:
            min_loss = epoch_loss  # track the best loss so far before overwriting the checkpoint
            if not os.path.exists(args.save_dir):
                os.mkdir(args.save_dir)
            else:
                shutil.rmtree(args.save_dir)
                assert not os.path.exists(args.save_dir)
                os.mkdir(args.save_dir)
            flow.save(model.state_dict(), args.save_dir)
        if i % 3 == 2:
            print(test(model, test_times=10))
def main(model_name=None, hidden=64, nlayers=1):
    voc_size = 10000
    inp = arange(2, voc_size, 2)
    tgt = arange(3, voc_size, 2)
    batch_size = 128
    epochs = 30

    dataset = NumberLoader(inp, tgt)
    train_len = int(len(dataset) * 0.9)
    val_len = len(dataset) - train_len
    train_set, val_set = random_split(dataset, [train_len, val_len])
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True, num_workers=1)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            shuffle=True, num_workers=1)

    model = TransformerModel(voc_size, voc_size, hidden=hidden, nlayers=nlayers)
    if model_name is not None:
        model.load_state_dict(load(model_name))
    model = model.cuda()

    # optimizer = optim.SGD(model.parameters(), lr=0.5)
    optimizer = optim.Adam(model.parameters())
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()

    best_loss = 100
    for i in range(epochs):
        epoch_loss = train(model, criterion, optimizer, train_loader)
        epoch_loss_val = validation(model, criterion, val_loader)
        # scheduler.step()
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name
ninput = M + n_meds
emsize = 512       # embedding dimension
nhid = 2048        # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6        # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8          # the number of heads in the multiheadattention models
dropout = 0.3
sequence_len = 42  # 1 week, 4-hr average
n_mc_smps = 20

model = TransformerModel(M=M, n_meds=n_meds, n_covs=n_covs,
                         sequence_len=sequence_len, emsize=emsize, nhead=nhead,
                         nhid=nhid, nlayers=nlayers, n_mc_smps=n_mc_smps,
                         dropout=dropout).to(globals.device)

print("data fully setup!")

### Training parameters
criterion = nn.BCEWithLogitsLoss(reduction='sum')
lr = 0.03
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

### Training
best_val_loss = float("inf")
train_on_gpu = False

vocab = pickle.load(
    open("models/transformer/vocab_siamzone-v4-space.pkl", "rb"))
vocab_to_int = vocab["vocab_to_int"]
int_to_vocab = vocab["int_to_vocab"]

ntokens = len(vocab_to_int)
emsize = 512
nhid = 512
nlayers = 4
nhead = 4
dropout = 0.2
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

model_save_path = "./models/transformer/lm-siamzone-v4-space-342.pkl"
model.load_state_dict(
    torch.load(model_save_path, map_location=torch.device("cpu")))
model.eval()
print("Model initialized")


def top_k_top_p_filtering(logits, top_k, top_p, temperature,
                          filter_value=-float("Inf")):
    # Hugging Face script to apply top k and nucleus sampling
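The body of top_k_top_p_filtering is cut off above. For orientation only, a generic top-k / nucleus filter over a 1-D logits tensor is sketched below; it is not necessarily the exact code used here (and it omits the temperature argument, which the snippet's signature includes).

import torch
import torch.nn.functional as F


def top_k_top_p_filter(logits, top_k=0, top_p=1.0, filter_value=-float("Inf")):
    # Keep only the top_k largest logits.
    if top_k > 0:
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < threshold] = filter_value
    # Keep the smallest set of tokens whose cumulative probability exceeds top_p.
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token that crosses the threshold is still kept.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits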
from data_load import vocab, train_data, get_batch, bptt, val_data
from model import TransformerModel
import torch.nn as nn
import torch
import math
import time
from tqdm import tqdm

ntokens = len(vocab.stoi)  # the size of vocabulary
emsize = 200   # embedding dimension
nhid = 200     # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2      # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


def train():
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in tqdm(enumerate(range(0, train_data.size(0) - 1, bptt))):
    return j


glove_embed = open_it(params.glove_embed)

########## Load dataset #############
dataset_object = wtwtDataset()
train_dataset = dataset_object.train_dataset
eval_dataset = dataset_object.eval_dataset
if params.dummy_run:
    eval_dataset = train_dataset
    target_names = []
else:
    eval_dataset = dataset_object.eval_dataset
    target_names = [dataset_object.id2stance[id_] for id_ in range(0, 4)]

########## Create model #############
model = TransformerModel(glove_embed, params.glove_dims, params.trans_ip_dims,
                         params.num_heads, params.trans_ff_hidden,
                         params.num_layers, params.mlp_hidden, params.dropout)
model = model.to(params.device)

print("Detected", torch.cuda.device_count(), "GPUs!")
model = torch.nn.DataParallel(model)

if params.wandb:
    wandb.watch(model)

########## Optimizer & Loss ###########


def my_fancy_optimizer(warmup_proportion=0.1):
    num_train_optimization_steps = len(train_dataset) * params.n_epochs

    param_optimizer = list(model.parameters())
    # param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    # train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/' + str(args.semi_dataset)  # five copy
            # sess = 't0005/rush1-1/209'  # one copy
            # sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # cleaning-noise version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]

        # original version
        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                          for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data] + semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)
        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            # tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

            ## to load pretrained model
            nsml.load(checkpoint='best', session='t0005/rush1-2/79')
            # print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
    shuffle=True,
    collate_fn=collate_trainval,
)

#########################################################
############### model, optimizer ########################
print("loading model and optimizer...")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("using GPU numbers {}".format(CONFIG.hyperparam.misc.gpu_ids))
else:
    device = torch.device("cpu")
    print("using CPU")

model = TransformerModel(
    CONFIG,
    vocab_size=len(tokenizer),
    bos_idx=tokenizer.bos_idx,
    pad_idx=tokenizer.pad_idx,
)
model = model.to(device)

if CONFIG.hyperparam.optimization.name == "Adam":
    optimizer = optim.Adam(
        model.parameters(),
        lr=CONFIG.hyperparam.optimization.lr,
        betas=(
            CONFIG.hyperparam.optimization.beta1,
            CONFIG.hyperparam.optimization.beta2,
        ),
        weight_decay=CONFIG.hyperparam.optimization.weight_decay,
    )
else:
    raise NotImplementedError("only Adam implemented")
    init_token='<sos>',
    eos_token='<eos>',
    lower=True)
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(
    TEXT,
    root='datas',
    train='wiki.train.tokens',
    validation='wiki.valid.tokens',
    test='wiki.test.tokens')

# build the vocabulary from the training set
TEXT.build_vocab(train_txt)

model = TransformerModel(len(TEXT.vocab.stoi),
                         ninp=200,
                         nhead=2,
                         nhid=200,
                         nlayers=2,
                         dropout=0.2).to(device)

# load the trained parameters into the model
# checkpoint = torch.load('datasets/models/best_model.pth.tar')
checkpoint = torch.load('temp/models/best_model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])

# seed sequence
history = 'it seems'
h = []
for w in history.split():
    h.append([TEXT.vocab.stoi[w]])

while (True):
    # convert the list to a tensor, then compute the model output
def main(args):
    random_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    ntokens = len(corpus.dictionary)
    print('loaded dictionary')

    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens, args.emsize, args.nhead, args.nhid,
            args.nlayers, args.dropout).to(device)
    else:
        model = RNNModel(
            args.model, ntokens, args.emsize, args.nhid,
            args.nlayers, args.dropout, args.tied).to(device)

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print('loaded model')

    is_transformer_model = hasattr(
        model, 'model_type') and model.model_type == 'Transformer'
    if not is_transformer_model:
        hidden = model.init_hidden(1)
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

    with open(args.outf, 'w') as outf:
        with torch.no_grad():  # no tracking history
            for i in range(args.words):
                if is_transformer_model:
                    output = model(input, False)
                    word_weights = output[-1].squeeze().div(args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                    input = torch.cat([input, word_tensor], 0)
                else:
                    output, hidden = model(input, hidden)
                    word_weights = output.squeeze().div(args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    input.fill_(word_idx)

                word = corpus.dictionary.idx2word[word_idx]
                outf.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % args.log_interval == 0:
                    print('| Generated {}/{} words'.format(i, args.words))
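The sampling step above divides the model's output scores by a temperature and exponentiates them before drawing from torch.multinomial (which accepts unnormalized weights): temperatures below 1 sharpen the distribution, temperatures above 1 flatten it. A standalone illustration of that step, using a hypothetical helper name not present in the original script, and assuming the model emits log-probabilities (as in the NLLLoss-trained variant above):

import torch


def sample_next_token(log_probs, temperature=1.0):
    # Scale the log-probabilities, exponentiate back to positive weights,
    # and draw a single token index.
    weights = log_probs.div(temperature).exp()
    return torch.multinomial(weights, num_samples=1).item()


# Example: sample from a uniform distribution over a 5-token vocabulary.
print(sample_next_token(torch.log(torch.ones(5) / 5), temperature=0.7))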
def train(args, logger, model_save_dir):
    # set seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if args.infre:
        pretrain_embed = pickle.load(
            open('../embed_infre/{}'.format(args.embed), 'rb'))
        train_dataset = pickle.load(open('../data/train.infre.pkl', 'rb'))
    else:
        pretrain_embed = pickle.load(
            open('../embed/{}'.format(args.embed), 'rb'))
        train_dataset = pickle.load(open('../data/train.pkl', 'rb'))

    try:
        pretrain_embed = torch.from_numpy(pretrain_embed).float()
    except:
        pretrain_embed = pretrain_embed.float()

    train_dataset = ProbingListMaxDataset(train_dataset)
    dataLoader = DataLoader(train_dataset, batch_size=args.batch_sz, shuffle=True)

    if args.model == 'BiLSTM':
        model = ListMax(args.hidden_dim, pretrain_embed)
    elif args.model == 'CNN':
        model = CNN(pretrained=pretrain_embed)
    else:
        model = TransformerModel(pretrained=pretrain_embed, nhead=5, nhid=50, nlayers=2)
        # model = ListMaxTransformer(args.hidden_dim, pretrain_embed)

    if torch.cuda.is_available():
        model.cuda()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_dev_acc = 0
    best_dev_model = None
    best_dev_test_acc = 0
    counter = 0

    for epoch in range(1, args.n_epoch + 1):
        train_loss = 0
        train_acc = 0
        model.train()
        iteration = 0
        for batch in dataLoader:
            optimizer.zero_grad()
            x = torch.stack(batch['input'])  # 5 x bz
            y = batch['label']  # bz
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()
            output = model(x)
            loss = criterion(output, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            train_acc += (output.argmax(1) == y).sum().item()
            iteration += 1
            # if iteration % args.iter_print == 0:
            #     logger.info('{}-{}-{}-{}'.format(epoch, iteration, train_loss, train_acc))

        train_loss = train_loss / len(train_dataset)
        train_acc = train_acc / len(train_dataset)
        dev_loss, dev_acc = val(model, mode='dev')
        test_loss, test_acc = val(model, mode='test')

        if dev_acc > best_dev_acc:
            best_dev_model = model.state_dict().copy()
            best_dev_acc = dev_acc
            best_dev_test_acc = test_acc
            counter = 0
        else:
            counter += 1

        logger.info('TRAIN: epoch:{}-loss:{}-acc:{}'.format(epoch, train_loss, train_acc))
        logger.info('DEV: epoch:{}-loss:{}-acc:{}'.format(epoch, dev_loss, dev_acc))
        logger.info('TEST: epoch:{}-loss:{}-acc:{}'.format(epoch, test_loss, test_acc))
        logger.info('BEST-DEV-ACC: {}, BEST-DEV-TEST-ACC:{}'.format(best_dev_acc, best_dev_test_acc))

        # if counter > 30:
        #     break

    torch.save(
        best_dev_model,
        model_save_dir + '/model-{}-{}.pt'.format(best_dev_test_acc, args.lr))