import numpy as np
import tensorflow as tf
from tqdm import tqdm


def main(rnn_type="rnn"):
    from data import loop_data, build_vocabulary, batchify

    np.random.seed(11)

    batch_size = 32
    n_steps = 20
    lr = 0.01
    lr_decay = 0.5

    train_text, valid_text = loop_data()
    vocab, rev_vocab = build_vocabulary(train_text)
    vocab_size = len(vocab)
    print("vocab size:", vocab_size)

    model = RNNModel(vocab_size, n_steps=n_steps, rnn_type=rnn_type)

    # TODO: sample decoded sentence
    with tf.Session() as sess:
        tf.global_variables_initializer().run()  # was the deprecated initialize_all_variables()

        prev_epoch_cost = 9999999  # arbitrarily large number
        for epoch in range(5):
            print("epoch", epoch)
            print("learning rate", lr)

            list_of_costs = []
            model.assign_lr(sess, lr)
            for idx, (x, y) in tqdm(
                    enumerate(batchify(train_text, vocab, batch_size, n_steps))):
                list_of_costs.append(model.step(sess, x, y, is_train=True))
                if idx % 100 == 0:
                    # 2**cost converts the (base-2) log loss back to perplexity.
                    print("cost", 2 ** np.mean(list_of_costs))
                    list_of_costs = []
            # Note: this only averages the costs accumulated since the last reset.
            epoch_cost = np.mean(list_of_costs)
            print("train cost", 2 ** epoch_cost)

            list_of_costs = []
            for idx, (x, y) in tqdm(
                    enumerate(batchify(valid_text, vocab, batch_size, n_steps))):
                list_of_costs.append(model.step(sess, x, y, is_train=False))
            epoch_cost = np.mean(list_of_costs)
            print("valid cost", 2 ** epoch_cost)

            # Decay the learning rate when the validation cost stops improving.
            if epoch_cost > prev_epoch_cost:
                lr *= lr_decay
            prev_epoch_cost = epoch_cost
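# batchify is imported from the data module but not shown in these excerpts.
# A minimal sketch of the interface the training loop above assumes: a
# generator yielding (x, y) int arrays of shape [batch_size, n_steps], where
# y is x shifted one token ahead. Names and layout are assumptions, not the
# original implementation.
def batchify(text, vocab, batch_size, n_steps):
    ids = np.array([vocab[tok] for tok in text], dtype=np.int32)
    # Trim so the stream splits evenly into batch_size rows of n_steps chunks.
    n_chunks = (len(ids) - 1) // (batch_size * n_steps)
    ids = ids[:n_chunks * batch_size * n_steps + 1]
    x = ids[:-1].reshape(batch_size, -1)  # inputs
    y = ids[1:].reshape(batch_size, -1)   # next-token targets
    for i in range(0, x.shape[1], n_steps):
        yield x[:, i:i + n_steps], y[:, i:i + n_steps]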
def _query_skeletons_to_responses(query, skeletons):
    # Pair the query with each candidate skeleton, duplicating both to fill
    # the four fields batchify expects.
    all_d = []
    for skeleton in skeletons:
        all_d.append([query, query, skeleton, skeleton])
    batch_dict = batchify(all_d, vocab_src, vocab_tgt, set(), None)
    hyps_batch = model.work(batch_dict, beam_size, max_time_step)

    responses = []
    for hyps in hyps_batch:
        # Rank hypotheses by length-normalized score (alpha = 0.6).
        hyps.sort(key=lambda x: x.score / ((1 + len(x.seq)) ** 0.6),
                  reverse=True)
        best_hyp = hyps[0]
        predicted_tgt = [token.raw for token in best_hyp.seq]
        predicted_tgt = predicted_tgt[1:-1]  # drop the first and last tokens (start/end markers)
        response = ''.join(predicted_tgt)
        responses.append(response)
    return responses
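# The sort above ranks beam hypotheses by a length-normalized score,
# score / (1 + len)^0.6, in the spirit of the GNMT length penalty: without
# it, longer sequences lose simply because they accumulate more negative
# log-probability terms. A tiny standalone illustration (numbers invented):
def length_normalized_score(log_prob_sum, seq_len, alpha=0.6):
    return log_prob_sum / ((1 + seq_len) ** alpha)

# A 10-token hypothesis with summed log-prob -12.0 scores about -2.85,
# putting it on an even footing with shorter hypotheses:
# length_normalized_score(-12.0, 10)  ->  -12.0 / 11**0.6  ~  -2.85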
corpus = data.Corpus(args.data, max_seq_len=args.prefix_len)

embeddings = None
if args.glove is not None:
    embeddings = read_glove(args.glove, corpus.dictionary)
if embeddings is not None:
    # Sanity-check that the pre-trained vectors match --emsize.
    first_tok = next(iter(embeddings))
    if len(embeddings[first_tok]) != args.emsize:
        print("ERROR: Embedding size (--emsize) %d is not the same as "
              "pre-trained embedding size %d"
              % (args.emsize, len(embeddings[first_tok])))
        sys.exit(-1)

eval_batch_size = 100
device = torch.device("cuda" if args.cuda else "cpu")
train_data = batchify(corpus.train, args.batch_size, device)
val_data = batchify(corpus.valid, eval_batch_size, device)
test_data = batchify(corpus.test, eval_batch_size, device)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.load is None:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout,  # (call truncated in this excerpt)
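# read_glove is not defined in these excerpts. A minimal sketch of what it
# presumably does, assuming args.glove points at a standard GloVe text file
# ("token v1 v2 ... vd" per line) and that only in-vocabulary tokens are
# kept; the real implementation may differ:
def read_glove(path, dictionary):
    embeddings = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            token, vector = parts[0], [float(v) for v in parts[1:]]
            if token in dictionary.word2idx:  # skip out-of-vocabulary tokens
                embeddings[token] = vector
    return embeddings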
""" BPE encoder """
encoder = json.load(open(encoder_path))
# Append four special tokens with consecutive new ids.
encoder['_pad_'] = len(encoder)
encoder['_start_'] = len(encoder)
encoder['_end_'] = len(encoder)
encoder['_unk_'] = len(encoder)
n_special = 4

""" DATA """
train, valid, test = get_data(encoder, data_dir, prefix,
                              params.cut_down_len, label_size, params.ratio)
max_len = 0.
if params.corpus == 'sage':
    train['text'] = batchify(np.array(train['text'][0]), params.batch_size)
    valid['text'] = batchify(np.array(valid['text'][0]), params.batch_size)
    test['text'] = batchify(np.array(test['text'][0]), params.batch_size)

""" Params """
if params.init_emb:
    word_embeddings = np.concatenate(
        [
            np.load(wordvec_path).astype(np.float32),
            np.zeros((1, params.d_model), np.float32),  # pad: zero vector
            # Small random init for the remaining special tokens.
            (np.random.randn(n_special - 1, params.d_model) * 0.02).astype(np.float32),
        ], 0)
else:
import math
import time

import torch
import torch.nn as nn

# (args, prepareData and batchify come from elsewhere in this script)


def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


if __name__ == "__main__":
    # prepare data
    np_data, np_labels, np_vdata, np_vlabels = prepareData()
    batch_size = args.batch_size  # TODO: batch_size and seq_len are the issues to be addressed
    n_epoches = args.max_epochs
    batches = batchify(np_data, batch_size, np_labels)
    vbatches = batchify(np_vdata, batch_size, np_vlabels)
    device = torch.device("cuda")

    # setup model
    from model import RNN, NaiveRNN
    input_size = 2
    hidden_size = args.hidden_size
    output_size = 2
    rnn = RNN(input_size, hidden_size, output_size, batch_size).to(device)
    # rnn = NaiveRNN(input_size, hidden_size, output_size, batch_size).to(device)

    # define loss
    criterion = nn.NLLLoss(reduction='none')
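# This batchify variant pairs inputs with labels, unlike the text-stream
# versions elsewhere in this file. A minimal sketch consistent with the
# calls above (an assumption, not the script's actual implementation):
def batchify(np_data, batch_size, np_labels):
    n = (len(np_data) // batch_size) * batch_size  # drop the ragged tail
    return [(np_data[i:i + batch_size], np_labels[i:i + batch_size])
            for i in range(0, n, batch_size)]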
# Tail of evaluate_iter(model, sentence, dict): accumulate the log-probability
# of each observed token, then convert to per-sentence perplexity.
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the observed token
        total_p += math.log(p)  # natural log
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    # Average per-sentence perplexity over the test set.
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)

    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens, config.batch_size)
    # train directly on the batchified data
    train(model, batch_source=train_batch_source)

    # test
    evaluate(model, test_dataset, dict)
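# The evaluation above computes per-sentence perplexity as
#   ppl = exp(-(1/N) * sum_i log p(w_i | w_<i)),
# the exponentiated average negative log-likelihood. Note that evaluate()
# then averages per-sentence perplexities, which is not the same quantity
# as a corpus-level perplexity over all tokens. A tiny self-contained check
# of the identity (probabilities are made up):
import math

def sentence_ppl(token_probs):
    total_log_p = sum(math.log(p) for p in token_probs)
    return math.exp(-total_log_p / len(token_probs))

# A model that assigns every token probability 0.1 has perplexity 10:
assert abs(sentence_ppl([0.1] * 4) - 10.0) < 1e-9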
print "getting data..." corpus = data.Corpus(args.data) eval_batch_size = 10 print "batching..." stops = [ i for i in range(len(corpus.train)) if corpus.train[i] == corpus.dictionary.word2idx["<eos>"] ] last = stops[args.nsentences - 1] corpus.train = corpus.train[:last] train_data = data.batchify(corpus.train, args.batch_size, args.cuda) valid_data = data.batchify(corpus.valid, eval_batch_size, args.cuda) test_data = data.batchify(corpus.test, eval_batch_size, args.cuda) print "getting model..." ntokens = len(corpus.dictionary) lm = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied) if args.cuda: lm.cuda() criterion = nn.CrossEntropyLoss()
for file in files:
    prefix = file.split('_')[0]
    if prefix == 'train':
        train_files.append(file)
    if prefix == 'valid':
        valid_files.append(file)

print('Start training!!!')
for epoch in range(1, args.epochs + 1):
    valid_fname = random.choice(valid_files)
    for train_fname in train_files:
        # Note: the loop variable is immediately overwritten, so each
        # iteration trains on a randomly chosen file (with replacement)
        # rather than walking train_files in order.
        train_fname = random.choice(train_files)
        corpus = data.SentenceCorpus(args.bptt, args.lm_data, args.tag_data,
                                     word2idx, tag2idx, idx2word, idx2tag,
                                     train_fname, valid_fname, None,
                                     testflag=args.test)
        train_lm_data = batchify(corpus.train_lm, args.batch_size)
        train_masking = batchify(corpus.train_maksing, args.batch_size)  # (sic: attribute spelled this way in SentenceCorpus)
        train_ccg_data = batchify(corpus.train_tag, args.batch_size)
        epoch_start_time = time.time()
        train(args, model, train_lm_data, train_masking, train_ccg_data,
              criterion, optimizer)
        val_lm_data = batchify(corpus.valid_lm, args.batch_size)
        val_masking = batchify(corpus.valid_maksing, args.batch_size)  # (sic)
        val_ccg_data = batchify(corpus.valid_tag, args.batch_size)
        val_loss = evaluate(args, model, val_lm_data, val_masking, val_ccg_data)
        print('-' * 80)
        print('| end of {} | time: {:5.2f}s | valid loss {:5.4f} '.format(
            train_fname, (time.time() - epoch_start_time), val_loss))
        print('-' * 80)
        # Save the model if the validation loss is the best we've seen so far.
def run(args):
    device = torch.device("cuda" if args.cuda else "cpu")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    debug_msg = (
        "\n\nFirst download the PTB dataset and dump it to sandbox/data/penn"
        "\nSee: https://github.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/tree/master/data"
    )
    assert (Path(dir_path) / "data/penn").exists(), debug_msg
    for f in ["train.txt", "test.txt", "valid.txt"]:
        assert (Path(dir_path) / f"data/penn/{f}").exists()

    eval_batch_size = 20
    corpus = Corpus("sandbox/data/penn")
    train_data = batchify(corpus.train, args.batch_size, device)
    val_data = batchify(corpus.valid, eval_batch_size, device)
    test_data = batchify(corpus.test, eval_batch_size, device)
    # Reversed copy of the test set, used below to measure the entropy
    # difference between forward and backward text.
    rev_test_data = batchify(
        corpus.test[
            torch.arange(corpus.test.shape[0] - 1, -1, step=-1).to(corpus.test.device)
        ],
        eval_batch_size,
        device,
    )

    ntokens = len(corpus.dictionary)
    model = LanguageModel(ntokens, args.hidden_size, args.num_layers).to(device)
    criterion = nn.CrossEntropyLoss(reduction="sum")

    # Loop over epochs.
    lr = args.lr
    best_val_loss = 1e9
    # At any point you can hit Ctrl + C to break out of training early.
    for epoch in range(args.epochs):
        epoch_start_time = time.time()
        train_epoch(model, criterion, corpus, train_data, epoch, lr)
        val_loss, val_entropy = evaluate(
            model, criterion, corpus, val_data, eval_batch_size
        )
        print("-" * 89)
        print(
            "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | "
            "valid ppl {:8.2f} | valid entropy {:8.2f}".format(
                epoch,
                (time.time() - epoch_start_time),
                val_loss,
                math.exp(val_loss),
                val_entropy,
            )
        )
        print("-" * 89)
        # Learning rate annealing
        if epoch >= 19:
            lr = lr * args.lr_decay
        print("=" * 89)
        if val_loss < best_val_loss:
            best_val_loss = val_loss  # was never updated, so every epoch saved
            torch.save(model.state_dict(), "bayesian_scratch_scaled.pt")

    # Run on test data.
    test_loss, test_entropy = evaluate(
        model, criterion, corpus, test_data, eval_batch_size
    )
    _, rev_test_entropy = evaluate(
        model, criterion, corpus, rev_test_data, eval_batch_size
    )
    print("=" * 89)
    print(
        "| End of training | test loss {:5.2f} | test ppl {:8.2f} |"
        " test entropy {:8.2f} | delta entropy {:8.2f}".format(
            test_loss,
            math.exp(test_loss),
            test_entropy,
            rev_test_entropy - test_entropy,
        )
    )
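# batchify here follows the interface of the PyTorch word_language_model
# example: trim a 1-D token tensor to a multiple of the batch size, then
# fold it into batch_size parallel text streams. A minimal sketch consistent
# with the calls above (the original lives in the data module and may differ
# in detail; the variant earlier in this file takes a cuda flag instead of
# a device but folds the data the same way):
def batchify(data, bsz, device):
    nbatch = data.size(0) // bsz
    # Trim off any leftover tokens that would not fill a full column.
    data = data.narrow(0, 0, nbatch * bsz)
    # Reshape to (bsz, nbatch), then transpose so each column is one
    # contiguous stream of text: final shape (nbatch, bsz).
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)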
def main(model=None):
    print(f'readying model & data @ {now()}')
    data = load_data()
    if not data:
        save_data(preprocess())
        data = load_data()
    if not model:
        if not config.fresh_model:
            model = load_model()
        if not model:
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ', end='')
        else:
            print('loaded ', end='')
    print(f'model: {describe_model(model)}')
    print(f'total files: {len(data)}, ', end='')

    data, data_dev = split_dataset(data)
    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)
    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')
    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ', end='\n')

    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped // config.hm_bars_slide
    if config.ckp_save_epochs == -1:
        config.ckp_save_epochs = range(config.hm_epochs)

    data_losss, dev_losss = [], []
    if config.initialize_loss:
        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model, data))
        dev_losss.append(dev_loss(model, data_dev))
        print(f'initial losses: {data_losss, dev_losss}')

    print(f'training started @ {now()}', flush=True)
    for ep in range(config.hm_epochs):
        loss = 0

        if config.train_parallel and config.train_combined:
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint)
                             for datapoint in data)
            if config.optimizer == 'sgd':
                sgd(model, batch_size=batch_size)
            else:
                adaptive_sgd(model, ep, batch_size=batch_size)
        else:
            for i, batch in enumerate(batchify(data)):
                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}',
                          end='', flush=True)
                batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint)
                                 for datapoint in batch)
                if config.train_parallel:
                    l, g = process_batch_parallel(model, batch)
                    loss += l
                    give_grads(model, g)
                elif config.train_combined:
                    loss += process_batch_combined(model, batch)
                else:
                    for j, datapoint in enumerate(batch):
                        states = None
                        for k, (inp, lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl, out)
                if config.optimizer == 'sgd':
                    sgd(model, batch_size=batch_size)
                else:
                    adaptive_sgd(model, ep, batch_size=batch_size)
                if config.disp_batches:
                    print(f', completed @ {now()}', flush=True)

        loss /= sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint)
                    for datapoint in data)
        data_losss.append(loss)
        dev_losss.append(dev_loss(model, data_dev))
        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)
        if ep in config.ckp_save_epochs:
            save_model(model, f'{config.model_save_path}_ckp{ep}')

    data_losss.append(dev_loss(model, data))
    dev_losss.append(dev_loss(model, data_dev))
    print(f'final losses: {[data_losss[-1], dev_losss[-1]]}')
    print(f'training ended @ {now()}', flush=True)

    plot(data_losss)
    show()
    plot(dev_losss)
    show()

    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(), config.model_save_path + '_prev')
        save_model(model)

    return model, [data_losss, dev_losss]
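# Unlike the other snippets, batchify(data) here takes no batch-size or
# device argument, so it presumably reads config.batch_size itself and
# chunks the list of datapoints. A minimal sketch under that assumption
# (shuffling gated on config.shuffle_epoch, mirroring the flag set above):
import random

def batchify(data):
    if config.shuffle_epoch:
        random.shuffle(data)
    return [data[i:i + config.batch_size]
            for i in range(0, len(data), config.batch_size)]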