def train(args):
    config = load_config(args.model_dir)
    train_dataset = LMDataset(config["train_file"],
                              vocab_file=config["vocab_file"])
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)
    valid_dataset = LMDataset(config["valid_file"], vocab_dump=vocab_dump_path)
    config["vocab_size"] = len(train_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        model.load_model(args.model_dir, args.epoch)
    model.train(epochs=config["train_epochs"],
                batch_size=config["batch_size"],
                data_engine=train_dataset,
                valid_data_engine=valid_dataset,
                train_decoder_epochs=config.get("train_decoder_epochs", 0),
                max_iter_per_epoch=config.get("max_iter_per_epoch", 100000))
def k_experiment(file, n):
    output = ''
    lmd = LMDataset(file)
    split = int(len(lmd.raw_data) * 0.8)
    train, test = lmd.raw_data[:split], lmd.raw_data[split:]
    X = []
    Y = []
    for k in np.arange(0.1, 1.1, step=0.1):
        lm = NgramLM(data=train, n=n)
        lm.train()
        tune = lm.generate()
        acc, pp = lm.test(data=test, s='add-k', k=k)
        X.append(k)
        Y.append(pp)
        op = 'n: {}\nk: {}\n{}\nAccuracy: {}\nPerplexity: {}\n\n'.format(
            str(n), str(k), tune, str(acc), str(pp))
        print(op)
        output += op
    with open('data/lm_k_exp', 'w') as f:
        f.write(output)
    plt.plot(X, Y, '-')
    plt.xlabel('k of add-k smoothing')
    plt.ylabel('Perplexity')
    plt.show()
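# NOTE: the sweep above varies the k of add-k (Lidstone) smoothing. As a point
# of reference, a hypothetical helper sketching the smoothed estimate being
# swept might look like the following; the counts dictionaries and the function
# name are illustrative assumptions, not part of NgramLM.
def add_k_prob(ngram_counts, context_counts, context, token, k, vocab_size):
    """Add-k smoothed estimate:
    P(token | context) = (C(context, token) + k) / (C(context) + k * |V|)."""
    numerator = ngram_counts.get(context + (token,), 0) + k
    denominator = context_counts.get(context, 0) + k * vocab_size
    return numerator / denominator
# Perplexity on the held-out split is then exp of the average negative log
# probability of its tokens under these estimates.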
def evaluate(model, sentences, vocab, reverse_vocab, hy, writer, device):
    dataset = LMDataset(sentences, vocab, reverse_vocab, hy.window_size)
    loader = DataLoader(dataset,
                        batch_size=hy.batch_size,
                        shuffle=True,
                        drop_last=True)
    vocab_size = len(vocab.keys())
    print("Loaded vocab of size {} for evaluation".format(vocab_size))
    perplexity = compute_model_accuracy(model, loader, device, writer)
    return perplexity
def ngram_lm(file, n, gen, test, smoothing, k):
    lmd = LMDataset(file)
    split = int(len(lmd.raw_data) * 0.8)
    # Keep the held-out split in its own name so it does not shadow the
    # boolean `test` flag argument.
    train_data, test_data = lmd.raw_data[:split], lmd.raw_data[split:]
    lm = NgramLM(data=train_data, n=n)
    lm.train()
    if gen:
        print('Generated Tune: \n{}'.format(lm.generate()))
    if test:
        acc, pp = lm.test(data=test_data, s=smoothing, k=k)
        print('Accuracy: {}\nPerplexity: {}'.format(acc, pp))
def train(args):
    args.save_dir += "_" + args.model_type + "_lm" if not args.seq2seq else "_seq2seq"
    os.makedirs(args.save_dir, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel
    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn,
                              shuffle=True)
    evaluate(model, valid_set)
    for epoch in range(args.num_epoch):
        model.train()
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                optimizer.zero_grad()
                loss = model.get_loss(**samples)
                loss.backward()
                optimizer.step()
                losses.append(loss.item())
                pbar.set_description(
                    "Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                    (epoch + 1, np.mean(losses),
                     optimizer.param_groups[0]['lr']))
        if epoch % args.save_interval == 0:
            torch.save(model,
                       args.save_dir + "/{}_{}.pt".format(args.model_type, epoch + 1))
        evaluate(model, valid_set)
def test(args):
    config = load_config(args.model_dir)
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    test_dataset = LMDataset(test_file, vocab_dump=vocab_dump_path)
    config["vocab_size"] = len(test_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)
    loss = model.test(batch_size=config["batch_size"],
                      data_engine=test_dataset)
def n_experiment(file, smoothing, k):
    output = ''
    lmd = LMDataset(file)
    split = int(len(lmd.raw_data) * 0.8)
    train, test = lmd.raw_data[:split], lmd.raw_data[split:]
    X = []
    Y = []
    Y_acc = []
    for n in range(2, 21):
        lm = NgramLM(data=train, n=n)
        lm.train()
        tune = lm.generate()
        acc, pp = lm.test(data=test, s=smoothing, k=k)
        X.append(n)
        Y.append(pp)
        Y_acc.append(acc)
        op = 'n: {}\nk: {}\n{}\nAccuracy: {}\nPerplexity: {}\n\n'.format(
            str(n), str(k), tune, str(acc), str(pp))
        print(op)
        output += op
    with open('data/lm_n_exp_t', 'w') as f:
        f.write(output)
    plt.plot(X, Y, '-')
    plt.xlabel('n of n-gram model')
    plt.ylabel('Perplexity')
    plt.savefig('data/lm_n_exp_pp_t')
    plt.clf()
    plt.plot(X, Y_acc, '-')
    plt.xlabel('n of n-gram model')
    plt.ylabel('Accuracy')
    plt.savefig('data/lm_n_exp_acc_t')
def train(args):
    if args.logdir is None:
        args.logdir = "Models-{}".format(time.strftime("%Y%m%d-%H%M%S"))
    task = "lm" if not args.seq2seq else "seq2seq"
    args.logdir += "_" + args.model_type + "_" + task
    os.makedirs(args.logdir, exist_ok=True)
    os.makedirs(os.path.join(args.logdir, "models"), exist_ok=True)
    print("Experiment dir : {}".format(args.logdir))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.logdir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    device = "cuda:" + str(args.gpuid) if torch.cuda.is_available() else "cpu"
    mem_crammer = []
    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel
    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    warmup_epoch = args.num_epoch * 0.1
    scheduler = ExponentialLR(optimizer,
                              0.1**(1 / (args.num_epoch - warmup_epoch)))
    iter_per_epoch = (len(train_set) + args.batch_size - 1) // args.batch_size
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn,
                              shuffle=True)
    bestppl = 1e9
    for epoch in range(args.num_epoch):
        model.train()
        if args.cram:
            # Grab spare GPU memory with junk tensors until allocation fails;
            # during training a piece is released whenever a step runs out of memory.
            while True:
                try:
                    junk = torch.rand((9999, 9999), dtype=float, device=device)
                except:
                    with torch.cuda.device(device):
                        torch.cuda.empty_cache()
                    break
                mem_crammer.append(junk)
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                if epoch < warmup_epoch:
                    warmup_scheduler.step()
                optimizer.zero_grad()
                while True:
                    success = True
                    try:
                        loss = model.get_loss(**samples)
                        loss.backward()
                        optimizer.step()
                    except:
                        # On failure (typically CUDA OOM), free one crammed tensor and retry.
                        del mem_crammer[-1]
                        with torch.cuda.device(device):
                            torch.cuda.empty_cache()
                        success = False
                        optimizer.zero_grad()
                    if success:
                        break
                losses.append(loss.item())
                pbar.set_description(
                    "Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                    (epoch + 1, np.mean(losses),
                     optimizer.param_groups[0]['lr']))
        logging.info(
            "Epoch: %d, Loss: %0.8f, lr: %0.6f" %
            (epoch + 1, np.mean(losses), optimizer.param_groups[0]['lr']))
        if epoch % args.save_interval == 0:
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, epoch + 1))
            torch.save(model, savepath)
            logging.info("Saving to {}".format(savepath))
        # Print a few sample generations from Chinese-poetry prompts.
        if task == "lm":
            print("好 -->", model.generate("好", beam_size=3, device=device))
            print("秋水 -->", model.generate("秋水", beam_size=3, device=device))
            print("寒烟翠-->", model.generate("寒烟翠", beam_size=3, device=device))
        elif task == "seq2seq":
            print("改革春风吹满地-->",
                  model.generate("改革春风吹满地", beam_size=2, device=device))
            print("牛津大学聪明人不及蟾蜍一半-->",
                  model.generate("牛津大学聪明人不及蟾蜍一半", beam_size=2, device=device))
            print("一支穿云箭,青天白日重新现-->",
                  model.generate("一支穿云箭,青天白日重新现", beam_size=2, device=device))
        loss, ppl = evaluate(model, valid_set, False)
        logging.info("Valid, Loss: %0.8f, ppl: %0.8f" % (loss, ppl))
        if ppl < bestppl:
            bestppl = ppl
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, task))
            torch.save(model, savepath)
            logging.info("Best ppl! Saving to {}".format(savepath))
        if epoch >= warmup_epoch:
            scheduler.step()
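# NOTE: WarmUpLR above is not a stock torch.optim.lr_scheduler class. A minimal
# sketch of the linear warm-up scheduler it is assumed to implement (ramping the
# learning rate from near zero to its base value over a fixed number of
# iterations) could look like this; it is an assumption, not the project's code.
from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    """Linearly scale the learning rate up to its base value over total_iters steps."""

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts the number of .step() calls made so far.
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]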
def tunes_test(file, n):
    lmd = LMDataset(file, byKey=True)
    for key, tunes in lmd.key_date.items():
        lm = NgramLM(tunes, n)
        lm.train()
        print(key, lm.generate())
def tunes_parse(file):
    lmd = LMDataset(file)
    return lmd.get_data()
# check cuda
if opt.cuda and not torch.cuda.is_available():
    raise RuntimeError('Cannot train on GPU because cuda is not available')
device = 'cuda' if opt.cuda else 'cpu'

torch.manual_seed(opt.seed)
if opt.cuda:
    torch.cuda.manual_seed(opt.seed)

# Initialize all except model
# vocab = torchtext.vocab.GloVe(name='840B', dim='300', cache='/media/data/nlp/wv/glove')
# vocab = pickle.load(open(opt.vocab_path, 'rb'))
lmdataset = LMDataset(vocab_path=opt.vocab_path,
                      corpus_path=opt.data_path,
                      bptt=opt.bptt,
                      device=device,
                      min_counts=opt.min_counts)
opt.vocab_size = len(lmdataset.vocab)
opt.device = device
lmloader = DataLoader(lmdataset, batch_size=opt.batch_size, shuffle=False)

# prefix is added to model name and to tensorboard scalar name
start_time = str(datetime.datetime.now()).replace(' ', '_').replace(':', '_')[:-10]
prefix = 'vocab_{}.emb_{}.hidden_{}.lr_{}.start_time_{}'.format(
    opt.vocab_size, opt.embedding_size, opt.hidden_size, opt.learning_rate,
    start_time)

if opt.tensorboard:
    from tensorboardX import SummaryWriter
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='sampler.py')
    opts.model_opts(parser)
    opts.model_io_opts(parser)
    opts.data_opts(parser)
    opts.sample_opts(parser)
    opt = parser.parse_args()

    if opt.cuda and not torch.cuda.is_available():
        raise RuntimeError(
            'Cannot sample on GPU because cuda is not available')
    device = 'cuda' if opt.cuda else 'cpu'

    model = torch.load(opt.checkpoint)
    model.device = device
    model.to(device)

    lmdataset = LMDataset(
        vocab_path=opt.vocab_path,
        corpus_path=opt.data_path,
        bptt=opt.length,
        device=device,
        min_counts=0  # TODO: make sure it works
    )
    sampler = Sampler(model, lmdataset)
    sampler.sample(opt.batch_size,
                   strategy=opt.sampling_strategy,
                   temperature=opt.temperature,
                   n_sampled=opt.length)
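# NOTE: the LMDataset used in the two scripts above takes a bptt window length
# and is iterated with a plain DataLoader. A hypothetical sketch of such a
# windowed dataset follows; the class name, attributes, and behaviour are
# assumptions for illustration, not the actual implementation.
import torch
from torch.utils.data import Dataset

class BPTTDataset(Dataset):
    """Serve fixed-length (input, target) windows from a flat token stream."""

    def __init__(self, token_ids, bptt):
        self.data = torch.tensor(token_ids, dtype=torch.long)
        self.bptt = bptt

    def __len__(self):
        # one window of bptt tokens per item; targets are shifted by one position
        return (len(self.data) - 1) // self.bptt

    def __getitem__(self, idx):
        start = idx * self.bptt
        inputs = self.data[start:start + self.bptt]
        targets = self.data[start + 1:start + self.bptt + 1]
        return inputs, targets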
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("-conf", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument(
        "--gpu",
        type=str,
        default=None,
        help="binary flag which gpu to use (For example '10100000' means use device_id=0 and 2)"
    )
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.conf)
    hidden_size = int(config["model"]["hidden_size"])
    num_hidden_layers = int(config["model"]["num_hidden_layers"])
    num_attention_heads = int(config["model"]["num_attention_heads"])
    intermediate_size = int(config["model"]["intermediate_size"])
    max_position_embeddings = int(config["model"]["max_position_embeddings"])
    #
    vocab_size = int(config["vocab"]["vocab_size"])
    mask_id = int(config["vocab"]["mask_id"])
    #
    log_path = config["log"]["log_path"]
    log_dir = os.path.dirname(log_path)
    os.makedirs(log_dir, exist_ok=True)
    log_step = int(config["log"]["log_step"])
    #
    train_size = int(config["data"]["train_size"])
    #
    save_prefix = config["save"]["save_prefix"]
    save_dir = os.path.dirname(save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    save_epoch = int(config["save"]["save_epoch"])
    #
    batch_size = int(config["train"]["batch_size"])
    if args.debug:
        batch_size = 10
    num_epochs = int(config["train"]["num_epochs"])
    learning_rate = float(config["train"]["learning_rate"])
    warmup_proportion = float(config["train"]["warmup_proportion"])
    weight_decay = float(config["train"]["weight_decay"])
    #
    num_to_mask = int(config["mask"]["num_to_mask"])
    max_seq_len = int(config["mask"]["max_seq_len"])

    if args.debug:
        logging.basicConfig(format="%(asctime)s %(message)s",
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename=log_path,
                            format="%(asctime)s %(message)s",
                            level=logging.DEBUG)

    bertconfig = modeling_bert.BertConfig(
        vocab_size_or_config_json_file=vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        max_position_embeddings=max_position_embeddings)
    model = BertForMaskedLM(config=bertconfig)
    total_params = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)

    if args.gpu is not None:
        device_ids = []
        for device_id, flag in enumerate(args.gpu):
            if flag == "1":
                device_ids.append(device_id)
        multi_gpu = True
        device = torch.device("cuda:{}".format(device_ids[0]))
    else:
        multi_gpu = False
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"device: {device}")

    if "model_path" in config["train"]:
        model_path = config["train"]["model_path"]
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        logging.info(f"load model from {model_path}")
    model.to(device)
    if multi_gpu:
        logging.info(f"GPU: device_id={device_ids}")
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.train()

    # optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    t_total = (train_size // batch_size) * num_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         weight_decay=weight_decay,
                         t_total=t_total)

    logging.info("start training...")
    for epoch in range(num_epochs):
        if "train_dir" in config["data"]:
            train_dir = config["data"]["train_dir"]
            datpaths = os.listdir(train_dir)
            random.shuffle(datpaths)
            for step_ds, path in enumerate(datpaths):
                path = os.path.join(train_dir, path)
                dataset = LMDataset(path)
                num_steps = (len(dataset) // batch_size) + 1
                logging.info(f"dataset from: {path}")
                loss_ds = train_dataset(dataset=dataset,
                                        model=model,
                                        optimizer=optimizer,
                                        multi_gpu=multi_gpu,
                                        device=device,
                                        epoch=epoch,
                                        batch_size=batch_size,
                                        num_steps=num_steps,
                                        log_step=log_step,
                                        num_to_mask=num_to_mask,
                                        mask_id=mask_id,
                                        max_seq_len=max_seq_len)
                logging.info(
                    f"step {step_ds + 1} / {len(datpaths)}: {(loss_ds / num_steps):.6f}"
                )
        else:
            train_path = config["data"]["train_path"]
            dataset = LMDataset(train_path)
            num_steps = (len(dataset) // batch_size) + 1
            loss_epoch = train_dataset(dataset=dataset,
                                       model=model,
                                       optimizer=optimizer,
                                       multi_gpu=multi_gpu,
                                       device=device,
                                       epoch=epoch,
                                       batch_size=batch_size,
                                       num_steps=num_steps,
                                       log_step=log_step,
                                       num_to_mask=num_to_mask,
                                       mask_id=mask_id,
                                       max_seq_len=max_seq_len)
            logging.info(
                f"epoch {epoch + 1} / {num_epochs} : {(loss_epoch / num_steps):.6f}"
            )
        if (epoch + 1) % save_epoch == 0:
            save_path = f"{save_prefix}.network.epoch{(epoch + 1):d}"
            optimizer_save_path = f"{save_prefix}.optimizer.epoch{(epoch + 1):d}"
            if multi_gpu:
                torch.save(model.module.state_dict(),
                           save_path.format(epoch + 1))
            else:
                torch.save(model.state_dict(), save_path.format(epoch + 1))
            logging.info(f"model saved: {save_path}")
            torch.save(optimizer.state_dict(),
                       optimizer_save_path.format(epoch + 1))
            logging.info(f"optimizer saved: {optimizer_save_path}")
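# NOTE: the epoch loop above delegates the actual masked-LM step to a
# train_dataset() helper that is not shown here. The masking it presumably
# performs (choose num_to_mask positions, replace them with mask_id, and supply
# labels only at those positions) might be sketched as follows; the function
# name and the -1 ignore index are assumptions for illustration.
import torch

def mask_tokens(input_ids, num_to_mask, mask_id, ignore_index=-1):
    """Mask num_to_mask random positions per sequence and build MLM labels
    that keep the original token only at the masked positions."""
    labels = torch.full_like(input_ids, ignore_index)
    for row in range(input_ids.size(0)):
        positions = torch.randperm(input_ids.size(1))[:num_to_mask]
        labels[row, positions] = input_ids[row, positions]  # targets at masked slots
        input_ids[row, positions] = mask_id                 # corrupt the inputs
    return input_ids, labels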