def main(args, local_rank):
    vocabs = dict()
    vocabs['tok'] = Vocab(args.tok_vocab, 5, [CLS])
    vocabs['lem'] = Vocab(args.lem_vocab, 5, [CLS])
    vocabs['pos'] = Vocab(args.pos_vocab, 5, [CLS])
    vocabs['ner'] = Vocab(args.ner_vocab, 5, [CLS])
    vocabs['predictable_concept'] = Vocab(args.predictable_concept_vocab, 10, [DUM, END])
    vocabs['concept'] = Vocab(args.concept_vocab, 5, [DUM, END])
    vocabs['rel'] = Vocab(args.rel_vocab, 50, [NIL])
    vocabs['word_char'] = Vocab(args.word_char_vocab, 100, [CLS, END])
    vocabs['concept_char'] = Vocab(args.concept_char_vocab, 100, [CLS, END])
    lexical_mapping = LexicalMap(args.lexical_mapping)

    if args.pretrained_word_embed is not None:
        vocab, pretrained_embs = load_pretrained_word_embed(args.pretrained_word_embed)
        vocabs['glove'] = vocab
    else:
        pretrained_embs = None

    for name in vocabs:
        print((name, vocabs[name].size))

    torch.manual_seed(19940117)
    torch.cuda.manual_seed_all(19940117)
    random.seed(19940117)

    device = torch.device('cuda', local_rank)
    #print(device)
    #exit()
    model = Parser(vocabs,
                   args.word_char_dim, args.word_dim,
                   args.pos_dim, args.ner_dim,
                   args.concept_char_dim, args.concept_dim,
                   args.cnn_filters, args.char2word_dim, args.char2concept_dim,
                   args.embed_dim, args.ff_embed_dim, args.num_heads, args.dropout,
                   args.snt_layers, args.graph_layers, args.inference_layers,
                   args.rel_dim, pretrained_embs, device=device)

    if args.world_size > 1:
        torch.manual_seed(19940117 + dist.get_rank())
        torch.cuda.manual_seed_all(19940117 + dist.get_rank())
        random.seed(19940117 + dist.get_rank())

    model = model.cuda(local_rank)
    train_data = DataLoader(vocabs, lexical_mapping, args.train_data, args.train_batch_size, for_train=True)
    dev_data = DataLoader(vocabs, lexical_mapping, args.dev_data, args.dev_batch_size, for_train=True)
    train_data.set_unk_rate(args.unk_rate)

    # Biases and layer-norm parameters are excluded from weight decay.
    weight_decay_params = []
    no_weight_decay_params = []
    for name, param in model.named_parameters():
        if name.endswith('bias') or 'layer_norm' in name:
            no_weight_decay_params.append(param)
        else:
            weight_decay_params.append(param)
    grouped_params = [
        {'params': weight_decay_params, 'weight_decay': 1e-4},
        {'params': no_weight_decay_params, 'weight_decay': 0.}
    ]
    optimizer = AdamWeightDecayOptimizer(grouped_params, lr=args.lr, betas=(0.9, 0.999), eps=1e-6)

    batches_acm, loss_acm, concept_loss_acm, arc_loss_acm, rel_loss_acm = 0, 0, 0, 0, 0
    #model.load_state_dict(torch.load('./ckpt/epoch297_batch49999')['model'])
    discarded_batches_acm = 0
    queue = mp.Queue(10)
    train_data_generator = mp.Process(target=data_proc, args=(train_data, queue))
    train_data_generator.start()
    used_batches = 0

    if args.resume_ckpt:
        ckpt = torch.load(args.resume_ckpt)
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        batches_acm = ckpt['batches_acm']
        del ckpt

    model.train()
    epoch = 0
    while True:
        batch = queue.get()
        #print("epoch", epoch)
        #print("batches_acm", batches_acm)
        #print("used_batches", used_batches)
        if isinstance(batch, str):
            epoch += 1
            print('epoch', epoch, 'done', 'batches', batches_acm)
        else:
            batch = move_to_device(batch, model.device)
            concept_loss, arc_loss, rel_loss = model(batch)
            loss = (concept_loss + arc_loss + rel_loss) / args.batches_per_update
            loss_value = loss.item()
            concept_loss_value = concept_loss.item()
            arc_loss_value = arc_loss.item()
            rel_loss_value = rel_loss.item()
            # Skip batches whose arc loss spikes far above the running average.
            if batches_acm > args.warmup_steps and arc_loss_value > 5. * (arc_loss_acm / batches_acm):
                discarded_batches_acm += 1
                print('abnormal', concept_loss.item(), arc_loss.item(), rel_loss.item())
                continue
            loss_acm += loss_value
            concept_loss_acm += concept_loss_value
            arc_loss_acm += arc_loss_value
            rel_loss_acm += rel_loss_value
            loss.backward()
            used_batches += 1
            # Accumulate gradients over `batches_per_update` batches before each optimizer step.
            if not (used_batches % args.batches_per_update == -1 % args.batches_per_update):
                continue

            batches_acm += 1
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            update_lr(optimizer, args.embed_dim, batches_acm, args.warmup_steps)
            optimizer.step()
            optimizer.zero_grad()

            if args.world_size == 1 or (dist.get_rank() == 0):
                if batches_acm % args.print_every == -1 % args.print_every:
                    print('Train Epoch %d, Batch %d, Discarded Batch %d, conc_loss %.3f, arc_loss %.3f, rel_loss %.3f' % (
                        epoch, batches_acm, discarded_batches_acm,
                        concept_loss_acm / batches_acm,
                        arc_loss_acm / batches_acm,
                        rel_loss_acm / batches_acm))
                    model.train()
                if batches_acm % args.eval_every == -1 % args.eval_every:
                    model.eval()
                    torch.save({'args': args,
                                'model': model.state_dict(),
                                'batches_acm': batches_acm,
                                'optimizer': optimizer.state_dict()},
                               '%s/epoch%d_batch%d' % (args.ckpt, epoch, batches_acm))
                    model.train()
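# Note: `update_lr(optimizer, args.embed_dim, batches_acm, args.warmup_steps)` is called
# above but not defined in this snippet. A minimal sketch, assuming it implements the
# inverse-square-root ("Noam") warmup schedule commonly paired with Transformer-style
# models; the formula below is an assumption, not taken from the original code:
def update_lr(optimizer, embed_dim, step, warmup_steps):
    step = max(step, 1)
    lr = embed_dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr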
        pickle.dump(self.sequence_autoencoder.to_json(), file)
        with open(hyper_params_file, mode='wb') as file:
            pickle.dump(self.get_hyper_params(), file)


def predict_sentences(predictions, vocab):
    return [
        " ".join(vocab.IdToWord(i) for i in prediction)
        for prediction in predictions
    ]


if __name__ == '__main__':
    vocab_file = '../vocab/vocab'
    tokenizer_file = '../tokenizer/src_tokenizer'
    vocab = Vocab(vocab_file, 100000)
    tokenizer = Tokenizer(vocab)
    with open(tokenizer_file, mode='wb') as file:
        pickle.dump(tokenizer, file)

    max_sequence_len = 10
    batch_size = 4
    p = Preprocessor(batch_size, 'data/sentences.txt', tokenizer, max_sequence_len)

    embedding_dim = 50
    hidden_dim = 100
    ae = AutoEncoder(max_sequence_len, vocab.NumIds(), embedding_dim, hidden_dim)
    ae.build_models()

    reducelr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
def main(unused_argv):
    FLAGS.sticker_path = os.path.join(FLAGS.base_path, FLAGS.sticker_path)
    FLAGS.data_path = os.path.join(FLAGS.base_path, FLAGS.data_path)
    FLAGS.test_path = os.path.join(FLAGS.base_path, FLAGS.test_path)
    FLAGS.vocab_path = os.path.join(FLAGS.base_path, FLAGS.vocab_path)
    FLAGS.emoji_vocab_path = os.path.join(FLAGS.base_path, FLAGS.emoji_vocab_path)
    FLAGS.inception_ckpt = os.path.join(FLAGS.base_path, FLAGS.inception_ckpt)

    if 'decode' in FLAGS.mode:
        FLAGS.single_pass = True
        FLAGS.batch_size = 4
        FLAGS.dataset_size = -1

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary
    emoji_vocab = Vocab(FLAGS.emoji_vocab_path, FLAGS.emoji_vocab_size)  # create a vocabulary
    if 'decode' in FLAGS.mode:
        batcher = Batcher(FLAGS.test_path, vocab, emoji_vocab, single_pass=FLAGS.single_pass)
    else:
        batcher = Batcher(FLAGS.data_path, vocab, emoji_vocab, single_pass=FLAGS.single_pass)

    if 'decode' in FLAGS.mode:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        import logging
        log = logging.getLogger('tensorflow')
        log.setLevel(logging.FATAL)
        for h in log.handlers:
            log.removeHandler(h)
        log.addHandler(logging.NullHandler())

    # GPU tricks
    if FLAGS.device is None:
        index_of_gpu = get_available_gpu()
        if index_of_gpu < 0:
            index_of_gpu = ''
        FLAGS.device = index_of_gpu
        tf.logging.info(bcolors.OKGREEN + 'using {}'.format(FLAGS.device) + bcolors.ENDC)
    else:
        index_of_gpu = FLAGS.device
    os.environ["CUDA_VISIBLE_DEVICES"] = str(index_of_gpu)

    tf.set_random_seed(5683)  # a seed value for randomness

    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting sticker classification in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and 'decode' not in FLAGS.mode:
        raise Exception("The single_pass flag should only be True in decode mode")

    ######################
    # save parameters and python script
    ######################
    export_json = {}
    for key, val in FLAGS.__flags.items():
        val = val._value
        export_json[key] = val

    # save parameters
    tf.logging.info('saving parameters')
    current_time_str = datetime.now().strftime('%m-%d-%H-%M')
    json_para_file = open(os.path.join(FLAGS.log_root, 'flags-' + current_time_str + '-' + FLAGS.mode + '.json'), 'w')
    json_para_file.write(json.dumps(export_json, indent=4) + '\n')
    json_para_file.close()

    # save python source code
    FLAGS.current_source_code_zip = os.path.abspath(
        os.path.join(FLAGS.log_root, 'source_code_bak-' + current_time_str + '-' + FLAGS.mode + '.zip'))
    tf.logging.info('saving source code: %s', FLAGS.current_source_code_zip)
    python_list = glob.glob('./*.py')
    zip_file = zipfile.ZipFile(FLAGS.current_source_code_zip, 'w')
    for d in python_list:
        zip_file.write(d)
    for d in glob.glob('slim/*.py'):
        zip_file.write(d)
    for d in glob.glob('models/*.py'):
        zip_file.write(d)
    zip_file.close()

    tf.set_random_seed(111)  # a seed value for randomness

    if FLAGS.mode == 'train':
        tf.logging.info("creating model...")
        model = StickerClassify()
        setup_training(model, batcher, emoji_vocab)
    elif FLAGS.mode == 'decode':
        tf.logging.info("creating model...")
        model = StickerClassify()
        run_test(model, batcher, emoji_vocab)
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode/auto_decode")
if __name__ == '__main__':
    config = Config(
        batch_size=1,
        char_embed_size=25,
        conv_kernel=3,
        depth=1,
        dropout=0.5,
        h_size=256,
        learning_rate=0.01,
        pool_size=53
    )

    vocab = Vocab('etc/samnorsk.300.skipgram.bin', 'etc/gazetteer.txt')

    output_types = (tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32)
    output_shapes = (
        [None, vocab.n_words],       # Word embeddings for each word in the sentence
        [None, vocab.n_pos],         # one_hot encoded PoS for each word
        [None, vocab.n_categories],  # NE category memberships
        [None, None],                # The characters for each word
        [None],                      # The number of characters per word
        [None],                      # The labels for each word
        []                           # the number of words in the sentence
    )

    examples = tf.data.Dataset.from_generator(
        vocab.examples(sys.argv[2]),
                   scripts_to_save=['main.py', 'model.py', 'nn_utils.py'])


def logging(s, print_=True, log_=True):
    if print_:
        print(s)
    if log_:
        with open(os.path.join(args.save, 'log.txt'), 'a+') as f_log:
            f_log.write(s + '\n')


logging('Args')
for k, v in args.__dict__.items():
    logging(' - {} : {}'.format(k, v))

vocab = Vocab('vocabv2.pkl', args.ntokens, '<unk>')


def get_file_list(filename):
    return [line.strip() for line in open(filename)]


def get_tensors(filenames, cache_file=None):
    if cache_file is not None and os.path.exists(cache_file):
        return pickle.load(open(cache_file, 'rb'))
    ret = []
    for filename in filenames:
        ret.extend(vocab.parse_file(filename))
    if cache_file is not None:
        pickle.dump(ret, open(cache_file, 'wb'))
    return ret
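# Example use of the helpers above: `get_tensors` parses each listed file with the
# global `vocab` and caches the result so later runs can skip re-parsing. The file
# names here are illustrative placeholders, not paths from the original project.
train_files = get_file_list('train_list.txt')  # one input file path per line (hypothetical)
train_tensors = get_tensors(train_files, cache_file='train_tensors.cache.pkl')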
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.singles_and_pairs == 'both': FLAGS.exp_name = FLAGS.exp_name + '_both' exp_name = _exp_name + '_both' dataset_articles = _dataset_articles else: FLAGS.exp_name = FLAGS.exp_name + '_singles' exp_name = _exp_name + '_singles' dataset_articles = _dataset_articles + '_singles' my_log_dir = os.path.join(log_dir, FLAGS.ssi_exp_name) print('Running statistics on %s' % FLAGS.exp_name) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists(os.path.join( FLAGS.data_root, FLAGS.dataset_name)) or len( os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: print(('No TF example data found at %s so creating it from raw data.' % os.path.join(FLAGS.data_root, FLAGS.dataset_name))) convert_data.process_dataset(FLAGS.dataset_name) logging.set_verbosity( logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'lambdamart_input' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", list(hps_dict.keys()))(**hps_dict) tf.set_random_seed(113) # a seed value for randomness decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 
    # 100 because the batches need to contain the full summaries

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    # Pickle files must be opened in binary mode.
    with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
        ssi_list = pickle.load(f)

    # The original condition `if 'cnn' or 'newsroom' in dataset_articles` always took the
    # first branch (a non-empty string is truthy); check both substrings explicitly.
    total = len(source_files) * 1000 if ('cnn' in dataset_articles or 'newsroom' in dataset_articles) else len(source_files)

    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)
    # batcher = Batcher(None, vocab, hps, single_pass=FLAGS.single_pass)
    model = SummarizationModel(decode_model_hps, vocab)
    decoder = BeamSearchDecoder(model, None, vocab)
    decoder.decode_iteratively(example_generator, total, names_to_types, ssi_list, hps)
    a = 0
def main(unused_argv): if len( unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists( os.path.join(FLAGS.data_root, FLAGS.dataset_name)) or len( os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: print( 'No TF example data found at %s so creating it from raw data.' % os.path.join( FLAGS.data_root, FLAGS.dataset_name)) convert_data.process_dataset(FLAGS.dataset_name) logging.set_verbosity(logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key, val in FLAGS.__flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) if FLAGS.pg_mmr or FLAGS.pg_mmr_sim or FLAGS.pg_mmr_diff: # Fit the TFIDF vectorizer if not already fitted if FLAGS.importance_fn == 'tfidf': tfidf_model_path = os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer', FLAGS.dataset_name + '.dill') if not os.path.exists(tfidf_model_path): print( 'No TFIDF vectorizer model file found at %s, so fitting the model now.' % tfidf_model_path) tfidf_vectorizer = fit_tfidf_vectorizer(hps, vocab) with open(tfidf_model_path, 'wb') as f: dill.dump(tfidf_vectorizer, f) # Train the SVR model on the CNN validation set if not already trained if FLAGS.importance_fn == 'svr': save_path = os.path.join(FLAGS.data_root, 'svr_training_data') importance_model_path = os.path.join(FLAGS.actual_log_root, 'svr.pickle') dataset_split = 'val' if not os.path.exists(importance_model_path): if not os.path.exists(save_path) or len( os.listdir(save_path)) == 0: print( 'No importance_feature instances found at %s so creating it from raw data.' % save_path) decode_model_hps = hps._replace( max_dec_steps=1, batch_size=100, mode='calc_features') # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 
100 because the batches need to contain the full summaries cnn_dm_train_data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, dataset_split + '*') batcher = Batcher(cnn_dm_train_data_path, vocab, decode_model_hps, single_pass=FLAGS.single_pass, cnn_500_dm_500=False) calc_features(cnn_dm_train_data_path, decode_model_hps, vocab, batcher, save_path) print( 'No importance_feature SVR model found at %s so training it now.' % importance_model_path) features_list = importance_features.get_features_list(True) sent_reps = importance_features.load_data( os.path.join(save_path, dataset_split + '*'), -1) print 'Loaded %d sentences representations' % len(sent_reps) x_y = importance_features.features_to_array(sent_reps, features_list) train_x, train_y = x_y[:, :-1], x_y[:, -1] svr_model = importance_features.run_training(train_x, train_y) with open(importance_model_path, 'wb') as f: cPickle.dump(svr_model, f) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness # Start decoding on multi-document inputs if hps.mode == 'decode': decode_model_hps = hps._replace( max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode() # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists(os.path.join( FLAGS.data_root, FLAGS.dataset_name)) or len( os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: raise Exception('No TF example data found at %s.' % os.path.join(FLAGS.data_root, FLAGS.dataset_name)) if FLAGS.singles_and_pairs == 'both': FLAGS.exp_name = FLAGS.exp_name + '_both' elif FLAGS.singles_and_pairs == 'singles': FLAGS.exp_name = FLAGS.exp_name + '_singles' logging.set_verbosity( logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) print(util.bcolors.OKGREEN + "Experiment path: " + FLAGS.log_root + util.bcolors.ENDC) if FLAGS.dataset_name == 'duc_2004': vocab = Vocab(FLAGS.vocab_path + '_' + 'cnn_dm', FLAGS.vocab_size) # create a vocabulary else: vocab = Vocab(FLAGS.vocab_path + '_' + FLAGS.dataset_name, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ item for item in list(FLAGS.flag_values_dict().keys()) if item != '?' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", list(hps_dict.keys()))(**hps_dict) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(113) # a seed value for randomness # Start decoding if hps.mode == 'train': print("creating model...") model = SummarizationModel(hps, vocab) setup_training(model, batcher) elif hps.mode == 'eval': model = SummarizationModel(hps, vocab) run_eval(model, batcher, vocab) elif hps.mode == 'decode': decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
from flask import Flask, jsonify, request
from flask import render_template

import config
from data import Vocab
from predict import build_batch_by_article
from predict import BeamSearch

model_path = "./logs/weibo_adagrad/train_20201204_215649/model/zh45000"
vocab = Vocab("./dataset/finished_files/vocab", config.vocab_size)
envocab = Vocab("./dataset/envocab", config.vocab_size)
beam_processor = BeamSearch(model_path, vocab)  # Note: pick the matching vocabulary for Chinese vs. English summarization


def ptrnet_predict(text):
    return "ptrnet"


app = Flask(
    __name__,
    static_folder='assets',
)


@app.route('/ptrnet', methods=['POST'])
def pnpredict():
    if request.method == 'POST':
        plaintext = request.get_json()['text']
        # print(plaintext)
        try:
            batch = build_batch_by_article(plaintext, vocab)
            summary = beam_processor.decode(batch)
class Seq2Seq(object): def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99): """Calculate the running average loss via exponential decay. This is used to implement early stopping w.r.t. a more smooth loss curve than the raw loss curve. Args: loss: loss on the most recent eval step running_avg_loss: running_avg_loss so far summary_writer: FileWriter object to write for tensorboard step: training iteration step decay: rate of exponential decay, a float between 0 and 1. Larger is smoother. Returns: running_avg_loss: new running average loss """ if running_avg_loss == 0: # on the first iteration just take the loss running_avg_loss = loss else: running_avg_loss = running_avg_loss * decay + (1 - decay) * loss running_avg_loss = min(running_avg_loss, 12) # clip loss_sum = tf.Summary() tag_name = 'running_avg_loss/decay=%f' % (decay) loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss) self.summary_writer.add_summary(loss_sum, step) tf.logging.info('running_avg_loss: %f', running_avg_loss) return running_avg_loss def restore_best_model(self): """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory""" tf.logging.info("Restoring bestmodel for training...") # Initialize all vars in the model sess = tf.Session(config=util.get_config()) print "Initializing all variables..." sess.run(tf.initialize_all_variables()) # Restore the best model from eval dir saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name]) print "Restoring all non-adagrad variables from best model in eval dir..." curr_ckpt = util.load_ckpt(saver, sess, "eval") print "Restored %s." % curr_ckpt # Save this model to train dir and quit new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model") new_fname = os.path.join(FLAGS.log_root, "train", new_model_name) print "Saving model to %s..." % (new_fname) new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables new_saver.save(sess, new_fname) print "Saved." exit() def restore_best_eval_model(self): # load best evaluation loss so far best_loss = None best_step = None # goes through all event files and select the best loss achieved and return it event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root))) for ef in event_files: try: for e in tf.train.summary_iterator(ef): for v in e.summary.value: step = e.step if 'running_avg_loss/decay' in v.tag: running_avg_loss = v.simple_value if best_loss is None or running_avg_loss < best_loss: best_loss = running_avg_loss best_step = step except: continue tf.logging.info('resotring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step)) return best_loss def convert_to_coverage_model(self): """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint""" tf.logging.info("converting non-coverage model to coverage model..") # initialize an entire coverage model from scratch sess = tf.Session(config=util.get_config()) print "initializing everything..." sess.run(tf.global_variables_initializer()) # load all non-coverage weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name]) print "restoring non-coverage variables..." curr_ckpt = util.load_ckpt(saver, sess) print "restored." # save this model and quit new_fname = curr_ckpt + '_cov_init' print "saving model to %s..." 
% (new_fname) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print "saved." exit() def convert_to_reinforce_model(self): """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint""" tf.logging.info("converting non-reinforce model to reinforce model..") # initialize an entire reinforce model from scratch sess = tf.Session(config=util.get_config()) print "initializing everything..." sess.run(tf.global_variables_initializer()) # load all non-reinforce weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name]) print "restoring non-reinforce variables..." curr_ckpt = util.load_ckpt(saver, sess) print "restored." # save this model and quit new_fname = curr_ckpt + '_rl_init' print "saving model to %s..." % (new_fname) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print "saved." exit() def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. 
if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. 
Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop() def run_training(self): """Repeatedly runs training iterations, logging loss to screen and writing summaries""" tf.logging.info("Starting run_training") if FLAGS.debug: # start the tensorflow debugger self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self.train_step = 0 if FLAGS.ac_training: # DDQN training is done asynchronously along with model training tf.logging.info('Starting DQN training thread...') self.dqn_train_step = 0 self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() watcher = Thread(target=self.watch_threads) watcher.daemon = True watcher.start() # starting the main thread tf.logging.info('Starting Seq2Seq training...') while True: # repeats until interrupted batch = self.batcher.next_batch() t0=time.time() if FLAGS.ac_training: # For DDQN, we first collect the model output to calculate the reward and Q-estimates # Then we fix the estimation either using our target network or using the true Q-values # This process will usually take time and we are working on improving it. transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q-values collection time: {}'.format(time.time()-t0)) # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph with self.dqn_graph.as_default(): batch_len = len(transitions) # we use current decoder state to predict q_estimates, use_state_prime = False b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs) # we also get the next decoder state to correct the estimation, use_state_prime = True b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) # use current DQN to estimate values from current decoder state dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] #dqn_q_estimate_loss = dqn_results['loss'] # use target DQN to estimate values for the next decoder state dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov # we use the q_estimate of UNK token for all the OOV tokens q_estimates = np.concatenate([q_estimates, np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1) # modify Q-estimates using the result collected from current and target DQN. 
# check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] # use scheduled sampling to whether use true Q-values or DDQN estimation if FLAGS.dqn_scheduled_sampling: q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DDQN based on true Q-values, # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) # Once we are done with modifying Q-values, we can use them to train the DDQN model. # In this paper, we use a priority experience buffer which always selects states with higher quality # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer. # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training # are full, the DDQN will start the training. self.replay_buffer.add(transitions) # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for # DDQN pre-training if FLAGS.dqn_pretrain: tf.logging.info('RUNNNING DQN PRETRAIN: Adding data to relplay buffer only...') continue # if not, use the q_estimation to update the loss. results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates) else: results = self.model.run_train_steps(self.sess, batch, self.train_step) t1=time.time() # get the summaries and iteration number so we can write summaries to tensorboard summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer self.train_step = results['global_step'] # we need this to update our running average loss tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0)) printer_helper = {} printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['sampled_r'] - printer_helper['greedy_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. 
Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) tf.logging.info('-------------------------------------------') self.summary_writer.add_summary(summaries, self.train_step) # write the summaries if self.train_step % 100 == 0: # flush the summary writer every so often self.summary_writer.flush() if FLAGS.ac_training: self.dqn_summary_writer.flush() if self.train_step > FLAGS.max_iter: break def dqn_training(self): """ training the DDQN network.""" try: while True: if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit() _t = time.time() self.avg_dqn_loss = [] avg_dqn_target_loss = [] # Get a batch of size dqn_batch_size from replay buffer to train the model dqn_batch = self.replay_buffer.next_batch() if dqn_batch is None: tf.logging.info('replay buffer not loaded enough yet...') time.sleep(60) continue # Run train step for Current DQN model and collect the results dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch) # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True) self.dqn_train_step = dqn_results['global_step'] self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries self.avg_dqn_loss.append(dqn_results['loss']) avg_dqn_target_loss.append(dqn_target_results['loss']) self.dqn_train_step = self.dqn_train_step + 1 tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t)) # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL with self.dqn_graph.as_default(): current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss))) tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss))) # sleeping is required if you want the keyboard interuption to work time.sleep(FLAGS.dqn_sleep_time) except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() self.dqn_sv.stop() def watch_threads(self): """Watch example queue and batch queue threads and restart if dead.""" while True: time.sleep(60) if not self.thrd_dqn_training.is_alive(): # if the thread is dead tf.logging.error('Found DQN Learning thread dead. Restarting.') self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries. 
Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, 
FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: loss = printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, summary_writer, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. 
Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: summary_writer.flush() #time.sleep(600) # run eval every 10 minute def main(self, unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary flags = getattr(FLAGS,"__flags") if not os.path.exists(FLAGS.log_root): if FLAGS.mode=="train": os.makedirs(FLAGS.log_root) fw = open('{}/config.txt'.format(FLAGS.log_root),'w') for k,v in flags.iteritems(): fw.write('{}\t{}\n'.format(k,v)) fw.close() else: raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'gpu_num', #'sampled_greedy_flag', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q', 'enc_hidden_dim', 'dec_hidden_dim', 'k', 'scheduled_sampling', 'sampling_probability','fixed_sampling_probability', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key,val in flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict if FLAGS.ac_training: hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # creating all the required parameters for DDQN model. 
if FLAGS.ac_training: hparam_list = ['lr', 'dqn_gpu_num', 'dqn_layers', 'dqn_replay_buffer_size', 'dqn_batch_size', 'dqn_target_update', 'dueling_net', 'dqn_polyak_averaging', 'dqn_sleep_time', 'dqn_scheduled_sampling', 'max_grad_norm'] hps_dict = {} for key,val in flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) hps_dict.update({'vocab_size':self.vocab.size()}) self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after) tf.set_random_seed(111) # a seed value for randomness if self.hps.mode == 'train': print "creating model..." self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: # current DQN with paramters \Psi self.dqn = DQN(self.dqn_hps,'current') # target DQN with paramters \Psi^{\prime} self.dqn_target = DQN(self.dqn_hps,'target') self.setup_training() elif self.hps.mode == 'eval': self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: self.dqn = DQN(self.dqn_hps,'current') self.dqn_target = DQN(self.dqn_hps,'target') self.run_eval() elif self.hps.mode == 'decode': decode_model_hps = self.hps # This will be the hyperparameters for the decoder model decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, self.vocab) if FLAGS.ac_training: # We need our target DDQN network for collecting Q-estimation at each decoder step. dqn_target = DQN(self.dqn_hps,'target') else: dqn_target = None decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target) decoder.decode() # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode") # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper def scheduled_sampling(self, batch_size, sampling_probability, true, estimate): with variable_scope.variable_scope("ScheduledEmbedding"): # Return -1s where we do not sample, and sample_ids elsewhere select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool) select_sample = select_sampler.sample(sample_shape=batch_size) sample_ids = array_ops.where( select_sample, tf.range(batch_size), gen_array_ops.fill([batch_size], -1)) where_sampling = math_ops.cast( array_ops.where(sample_ids > -1), tf.int32) where_not_sampling = math_ops.cast( array_ops.where(sample_ids <= -1), tf.int32) _estimate = array_ops.gather_nd(estimate, where_sampling) _true = array_ops.gather_nd(true, where_not_sampling) base_shape = array_ops.shape(true) result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape) result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape) result = result1 + result2 return result1 + result2
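# For reference, the scatter/gather logic in `scheduled_sampling` above amounts to a
# per-row Bernoulli mix of two tensors. A plain-NumPy sketch of the same idea follows;
# it is an illustration only, and `scheduled_sampling_np` is a hypothetical name, not
# part of the project's API.
import numpy as np

def scheduled_sampling_np(sampling_probability, true_q, estimated_q):
    batch_size = true_q.shape[0]
    # With probability `sampling_probability` a row takes the DDQN estimate;
    # otherwise it keeps the true Q-values.
    use_estimate = np.random.random(batch_size) < sampling_probability
    return np.where(use_estimate[:, None], estimated_q, true_q)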
parser.add_argument('--load_path', type=str) parser.add_argument('--test_data', type=str) parser.add_argument('--test_batch_size', type=int) parser.add_argument('--beam_size', type=int) parser.add_argument('--max_time_step', type=int) parser.add_argument('--output_file', type=str) parser.add_argument('--verbose', action='store_true') return parser.parse_args() if __name__ == "__main__": args = parse_config() ckpt = torch.load(args.load_path) model_args = ckpt['args'] vocab_src = Vocab(model_args.vocab_src, with_SE=False) vocab_tgt = Vocab(model_args.vocab_tgt, with_SE=True) model = ResponseGenerator(vocab_src, vocab_tgt, model_args.embed_dim, model_args.hidden_size, model_args.num_layers, model_args.dropout, model_args.input_feed) model.load_state_dict(ckpt['model']) model = model.cuda() test_data = DataLoader(args.test_data, vocab_src, vocab_tgt, args.test_batch_size, False) model.eval() if args.verbose: queries = [x.strip().split('|')[0] for x in open(args.test_data).readlines()] qid = 0 with open(args.output_file, 'w') as fo: for batch_dict in test_data:
class BeamSearch(object): def __init__(self, model_file_path): model_name = os.path.basename(model_file_path) self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name)) self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref') self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir') # create the three output directories for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]: if not os.path.exists(p): os.mkdir(p) self.vocab = Vocab(config.vocab_path, config.vocab_size) self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode', batch_size=config.beam_size, single_pass=True) time.sleep(15) self.model = Model(model_file_path, is_eval=True) def sort_beams(self, beams): return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True) def decode(self): start = time.time() counter = 0 batch = self.batcher.next_batch() while batch is not None: # Run beam search to get best Hypothesis best_summary = self.beam_search(batch) # Extract the output ids from the hypothesis and convert back to words output_ids = [int(t) for t in best_summary.tokens[1:]] decoded_words = data.outputids2words( output_ids, self.vocab, (batch.art_oovs[0] if config.pointer_gen else None)) # Remove the [STOP] token from decoded_words, if necessary try: fst_stop_idx = decoded_words.index(data.STOP_DECODING) decoded_words = decoded_words[:fst_stop_idx] except ValueError: decoded_words = decoded_words original_abstract_sents = batch.original_abstracts_sents[0] write_for_rouge(original_abstract_sents, decoded_words, counter, self._rouge_ref_dir, self._rouge_dec_dir) counter += 1 if counter % 1000 == 0: print('%d examples in %d sec' % (counter, time.time() - start)) start = time.time() batch = self.batcher.next_batch() print("Decoder has finished reading dataset for single_pass.") print("Now starting ROUGE eval...") results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir) rouge_log(results_dict, self._decode_dir) def beam_search(self, batch): # batch should have only one example enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \ get_input_from_batch(batch) encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder( enc_batch, enc_lens) s_t_0 = self.model.reduce_state(encoder_hidden) dec_h, dec_c = s_t_0 # 1 x 2*hidden_size dec_h = dec_h.squeeze() dec_c = dec_c.squeeze() # decoder batch preparation: it holds beam_size hypotheses, and initially everything is repeated beams = [ Beam(tokens=[self.vocab.word2id(data.START_DECODING)], log_probs=[0.0], state=(dec_h[0], dec_c[0]), context=c_t_0[0], coverage=(coverage_t_0[0] if config.is_coverage else None)) for _ in range(config.beam_size) ] results = [] steps = 0 while steps < config.max_dec_steps and len(results) < config.beam_size: latest_tokens = [h.latest_token for h in beams] latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \ for t in latest_tokens] y_t_1 = Variable(torch.LongTensor(latest_tokens)) if USE_CUDA: y_t_1 = y_t_1.to(DEVICE) all_state_h = [] all_state_c = [] all_context = [] for h in beams: state_h, state_c = h.state all_state_h.append(state_h) all_state_c.append(state_c) all_context.append(h.context) s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0)) c_t_1 = torch.stack(all_context, 0) coverage_t_1 = None if config.is_coverage: all_coverage = [] for h in beams: all_coverage.append(h.coverage) coverage_t_1 = torch.stack(all_coverage, 0) final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = 
self.model.decoder( y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps) log_probs = torch.log(final_dist) topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2) dec_h, dec_c = s_t dec_h = dec_h.squeeze() dec_c = dec_c.squeeze() all_beams = [] num_orig_beams = 1 if steps == 0 else len(beams) for i in range(num_orig_beams): h = beams[i] state_i = (dec_h[i], dec_c[i]) context_i = c_t[i] coverage_i = (coverage_t[i] if config.is_coverage else None) for j in range(config.beam_size * 2): # for each of the top 2*beam_size hyps: new_beam = h.extend(token=topk_ids[i, j].item(), log_prob=topk_log_probs[i, j].item(), state=state_i, context=context_i, coverage=coverage_i) all_beams.append(new_beam) beams = [] for h in self.sort_beams(all_beams): if h.latest_token == self.vocab.word2id(data.STOP_DECODING): if steps >= config.min_dec_steps: results.append(h) else: beams.append(h) if len(beams) == config.beam_size or len( results) == config.beam_size: break steps += 1 if len(results) == 0: results = beams beams_sorted = self.sort_beams(results) return beams_sorted[0]
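The Beam class used by beam_search above is defined elsewhere; the following is a minimal sketch of the interface the loop assumes (tokens, log_probs, extend, latest_token, avg_log_prob), with a length-normalized average log-probability as the ranking score. The field names mirror the calls above, but the actual implementation may differ.

class Beam(object):
    # Sketch of the hypothesis object assumed by sort_beams/beam_search above.
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens        # decoded ids so far, starting with [START]
        self.log_probs = log_probs  # per-token log-probabilities
        self.state = state          # decoder (h, c) state for this hypothesis
        self.context = context      # last attention context vector
        self.coverage = coverage    # coverage vector, or None

    def extend(self, token, log_prob, state, context, coverage):
        # Return a new hypothesis with one more decoded token appended.
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state, context=context, coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        # Length-normalized score used to rank competing hypotheses.
        return sum(self.log_probs) / len(self.tokens)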
import sys from data import Vocab vocab_tgt = Vocab('../data/golden/vocab_tgt') with open(sys.argv[1]) as f: for line in f.readlines(): x = line.strip().split('|') y = x[-1] z = [int(t) for t in y.split()] iszero = False new_z = [] for w in z: if iszero and w == 0: continue else: new_z.append(w) iszero = (w == 0) print(' '.join([vocab_tgt.i2s(w) for w in new_z]))
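A worked example of the zero-collapsing loop above: runs of consecutive 0 ids (whatever symbol id 0 denotes in vocab_tgt) are reduced to a single 0 before the ids are mapped back to tokens.

# Example input/output for the de-duplication loop above.
z = [5, 0, 0, 0, 7, 0, 9]
new_z, iszero = [], False
for w in z:
    if iszero and w == 0:
        continue
    new_z.append(w)
    iszero = (w == 0)
print(new_z)  # [5, 0, 7, 0, 9]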
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hps_dict = { 'mode': FLAGS.mode, 'rand_unif_init_mag': FLAGS.rand_unif_init_mag, 'trunc_norm_init_std': FLAGS.trunc_norm_init_std, 'max_grad_norm': FLAGS.max_grad_norm, 'hidden_dim': FLAGS.hidden_dim, 'emb_dim': FLAGS.emb_dim, 'batch_size': FLAGS.batch_size, 'max_dec_steps': FLAGS.max_dec_steps, 'max_enc_steps': FLAGS.max_enc_steps, 'pointer_gen': FLAGS.pointer_gen, 'lr': FLAGS.lr, 'keep_prob': FLAGS.keep_prob } hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) eval_batcher = Batcher('eval', vocab, hps, single_pass=False) tf.set_random_seed(111) # a seed value for randomness if hps.mode == 'train': print("creating model...") model = MultiRelationModel(hps, vocab) setup_training(model, batcher, eval_batcher) elif hps.mode == 'eval': model = MultiRelationModel(hps, vocab) run_eval(model, batcher) elif hps.mode == 'decode': # decode_model_hps = hps # This will be the hyperparameters for the decoder model decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = MultiRelationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
parser.add_argument('--train_data', type=str) parser.add_argument('--dev_data', type=str) parser.add_argument('--which_ranker', type=str) return parser.parse_args() def update_lr(optimizer, coefficient): for param_group in optimizer.param_groups: param_group['lr'] = param_group['lr'] * coefficient if __name__ == "__main__": random.seed(19940117) torch.manual_seed(19940117) args = parse_config() vocab_src = Vocab(args.vocab_src, with_SE=False) vocab_tgt = Vocab(args.vocab_tgt, with_SE=False) if args.which_ranker == 'ranker': from ranker import Ranker elif args.which_ranker == 'masker_ranker': from masker_ranker import Ranker model = Ranker(vocab_src, vocab_tgt, args.embed_dim, args.ff_embed_dim, args.num_heads, args.dropout, args.num_layers) model = model.cuda() optimizer = torch.optim.Adam(model.parameters(), args.lr) train_data = DataLoader(args.train_data, vocab_src, vocab_tgt, args.train_batch_size, True) dev_data = DataLoader(args.dev_data, vocab_src, vocab_tgt, args.dev_batch_size, True)
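A usage sketch for the multiplicative update_lr helper defined above; the decay-on-plateau schedule shown here is illustrative only and is not taken from this codebase.

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def update_lr(optimizer, coefficient):
    # Re-stated from above so this sketch runs standalone.
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * coefficient

best = float('inf')
for dev_loss in [0.9, 0.8, 0.85, 0.84]:  # pretend per-epoch dev losses
    if dev_loss < best:
        best = dev_loss
    else:
        update_lr(optimizer, 0.5)        # halve the lr when dev loss stalls
print(optimizer.param_groups[0]['lr'])   # 1e-3 * 0.5 * 0.5 = 2.5e-4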
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) # Loading the external information first extra_info = {} if os.path.exists(FLAGS.external_config): external_params = xml_parser.parse(FLAGS.external_config, flat=False) if 'sent2vec_params' in external_params: sent2vec_params = external_params['sent2vec_params'] convnet_params = sent2vec_params['convnet_params'] convnet_model2load = sent2vec_params['model2load'] gamma = 0.2 if not 'gamma' in sent2vec_params else sent2vec_params[ 'gamma'] my_convnet = convnet.convnet(convnet_params) my_convnet.train_validate_test_init() my_convnet.load_params(file2load=convnet_model2load) fixed_vars = tf.global_variables() fixed_vars.remove(my_convnet.embedding_matrix) extra_info['sent2vec'] = {'gamma': gamma, 'network': my_convnet} extra_info['fixed_vars'] = fixed_vars if 'key_phrases' in external_params: # TODO: phrase some parameters to import the results of key-phrase extracted or \ # parameters for online key-phrase extraction extra_info['key_phrases'] = {} raise NotImplementedError( 'Key phrases part has not been implemented yet') tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen' ] hps_dict = {} for key, val in FLAGS.__flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness if hps.mode == 'train': print "creating model..." model = SummarizationModel(hps, vocab, extra_info) setup_training(model, batcher) elif hps.mode == 'eval': model = SummarizationModel(hps, vocab, extra_info) run_eval(model, batcher, vocab) elif hps.mode == 'decode': decode_model_hps = hps # This will be the hyperparameters for the decoder model decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 
100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab, extra_info) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) pp = pprint.PrettyPrinter() pp.pprint(FLAGS.__flags) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want if FLAGS.model not in ['selector', 'rewriter', 'end2end']: raise ValueError( "The 'model' flag must be one of selector/rewriter/end2end") if FLAGS.mode not in ['train', 'eval', 'evalall']: raise ValueError("The 'mode' flag must be one of train/eval/evalall") tf.logging.info('Starting %s in %s mode...' % (FLAGS.model, FLAGS.mode)) # # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.model, FLAGS.exp_name) # abstractor save path if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in evalall mode, set batch_size = 1 or beam_size # Reason: in evalall mode, we decode one example at a time. # For rewriter, on each step, we have beam_size-many hypotheses in the beam, # so we need to make a batch of these hypotheses. if FLAGS.mode == 'evalall': if FLAGS.model == 'selector': FLAGS.batch_size = 1 else: if FLAGS.decode_method == 'beam': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in evalall mode if FLAGS.single_pass and FLAGS.mode == 'train': raise Exception( "The single_pass flag should not be True in train mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'model', 'mode', 'eval_method', 'selector_loss_wt', 'inconsistent_loss', 'inconsistent_topk', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim_selector', 'hidden_dim_rewriter', 'emb_dim', 'batch_size', 'max_art_len', 'max_sent_len', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'eval_gt_rouge', 'decode_method', 'lr', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'rl_training', 'matrix_attention', 'pointer_gen', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'k', 'calculate_true_q', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'gpu_num', 'enc_hidden_dim', 'dec_hidden_dim', 'scheduled_sampling', 'sampling_probability', 'fixed_sampling_probability', 'hard_argmax', 'greedy_scheduled_sampling', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'ac_training' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict # for val in FLAGS: # if val in hparam_list: # hps_dict[val] = FLAGS[val].value hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) start = time.perf_counter() if FLAGS.model == 'selector': # extractor print(hps.mode) if hps.mode == 'train': # train print("creating model...") model = SentenceSelector(hps, vocab) # init run_selector.setup_training(model, batcher, vocab.getWordEmbedding()) elif hps.mode == 'eval': # evaluation model = SentenceSelector(hps, vocab) 
run_selector.run_eval(model, batcher, vocab.getWordEmbedding()) elif hps.mode == 'evalall': # test, get rouge and output model = SentenceSelector(hps, vocab) evaluator = SelectorEvaluator(model, batcher, vocab) evaluator.evaluate() elif FLAGS.model == 'rewriter': # abstractor if hps.mode == 'train': print("creating model...") model = Rewriter(hps, vocab) run_rewriter.setup_training(model, batcher, vocab.getWordEmbedding()) elif hps.mode == 'eval': model = Rewriter(hps, vocab) if FLAGS.eval_method == 'loss': vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) run_rewriter.run_eval(model, batcher, vocab.getWordEmbedding()) elif FLAGS.eval_method == 'rouge': assert FLAGS.decode_method == 'greedy' decoder = BeamSearchDecoder(model, batcher, vocab) run_rewriter.run_eval_rouge(decoder) elif hps.mode == 'evalall': decode_model_hps = hps # This will be the hyperparameters for the decoder model if FLAGS.decode_method == 'beam': decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = Rewriter(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.evaluate( ) # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) elif FLAGS.model == 'end2end': # end2end if hps.mode == 'train': print("creating model...") select_model = SentenceSelector(hps, vocab) # extractor init rewrite_model = Rewriter(hps, vocab) # abstractor init end2end_model = SelectorRewriter(hps, select_model, rewrite_model) # end2end init run_end2end.setup_training( end2end_model, batcher, vocab.getWordEmbedding()) # train setting elif hps.mode == 'eval': select_model = SentenceSelector(hps, vocab) rewrite_model = Rewriter(hps, vocab) end2end_model = SelectorRewriter(hps, select_model, rewrite_model) if FLAGS.eval_method == 'loss': run_end2end.run_eval(end2end_model, batcher, vocab.getWordEmbedding()) elif FLAGS.eval_method == 'rouge': assert FLAGS.decode_method == 'greedy' evaluator = End2EndEvaluator(end2end_model, batcher, vocab) run_end2end.run_eval_rouge(evaluator) elif hps.mode == 'evalall': eval_model_hps = hps # This will be the hyperparameters for the decoder model if FLAGS.decode_method == 'beam': eval_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries select_model = SentenceSelector(eval_model_hps, vocab) rewrite_model = Rewriter(eval_model_hps, vocab) end2end_model = SelectorRewriter(hps, select_model, rewrite_model) evaluator = End2EndEvaluator(end2end_model, batcher, vocab) evaluator.evaluate( ) # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) delta = time.perf_counter() - start print("running time: {} seconds".format(delta))
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting running in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary tf.set_random_seed(6) # a seed value for randomness cnn_classifier = CNN(config) #cnn_batcher = ClaBatcher(hps_discriminator, vocab) cnn_batcher = ClaBatcher(FLAGS, vocab) sess_cnn, saver_cnn, train_dir_cnn = setup_training_classifier( cnn_classifier) run_train_cnn_classifier(cnn_classifier, cnn_batcher, 10, sess_cnn, saver_cnn, train_dir_cnn) #util.load_ckpt(saver_cnn, sess_cnn, ckpt_dir="train-classifier") acc = run_test_classification(cnn_classifier, cnn_batcher, sess_cnn, saver_cnn, str('last')) print("the last stored cnn model acc = ", acc) generate_confident_examples(cnn_classifier, cnn_batcher, sess_cnn) ## train_conf print("Start pre-training attention classification......") model_class = Classification(FLAGS, vocab) cla_batcher = AttenBatcher(FLAGS, vocab) # read from train_conf sess_cls, saver_cls, train_dir_cls = setup_training_classification( model_class) run_pre_train_classification(model_class, cla_batcher, 10, sess_cls, saver_cls, train_dir_cls) #util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification") acc = run_test_classification(model_class, cla_batcher, sess_cls, saver_cls, str("final_acc")) print("the last stored attention model acc = ", acc) acc = run_test_classification(cnn_classifier, cla_batcher, sess_cnn, saver_cnn, str("final_acc")) print("the last stored classifier model acc = ", acc) generated = Generate_training_sample(model_class, vocab, cla_batcher, sess_cls) print("Generating training examples......") generated.generate_training_example("train_filtered") #write train generated.generator_validation_example("valid_filtered") model = Seq2seq_AE(FLAGS, vocab) # Create a batcher object that will create minibatches of data batcher = GenBatcher(vocab, FLAGS) ##read from train sess_ge, saver_ge, train_dir_ge = setup_training_generator(model) generated = Generated_sample(model, vocab, batcher, sess_ge) print("Start training generator......") run_pre_train_auto_encoder(model, batcher, 20, sess_ge, saver_ge, train_dir_ge, generated, cnn_classifier, sess_cnn, cla_batcher)
def main(unused_argv): print("unused_argv: ", unused_argv) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) print("FLAGS.vocab_size: ", FLAGS.vocab_size) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary print("vocab size: ", vocab.size()) # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'fine_tune', 'train_size', 'subred_size', 'use_doc_vec', 'use_multi_attn', 'use_multi_pgen', 'use_multi_pvocab', 'create_ckpt' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness # return if hps.mode.value == 'train': print("creating model...") model = SummarizationModel(hps, vocab) # ------------------------------------- if hps.create_ckpt.value: step = 0 model.build_graph() print("get value") pretrained_ckpt = '/home/cs224u/pointer/log/pretrained_model_tf1.2.1/train/model-238410' reader = pywrap_tensorflow.NewCheckpointReader(pretrained_ckpt) var_to_shape_map = reader.get_variable_to_shape_map() value = {} for key in var_to_shape_map: value[key] = reader.get_tensor(key) print("assign op") assign_op = [] if hps.use_multi_pvocab.value: new_key = [ "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias", "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias" ] for v in tf.trainable_variables(): key = v.name.split(":")[0] if key in new_key: origin_key = "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/" + key.split( "/")[-1] a_op = v.assign(tf.convert_to_tensor( value[origin_key])) else: a_op = v.assign(tf.convert_to_tensor(value[key])) # if key == "seq2seq/embedding/embedding": # a_op = v.assign(tf.convert_to_tensor(value[key])) assign_op.append(a_op) else: for v in tf.trainable_variables(): key = v.name.split(":")[0] if key == "seq2seq/embedding/embedding": a_op = v.assign(tf.convert_to_tensor(value[key])) assign_op.append(a_op) # ratio = 1 # for v in tf.trainable_variables(): # key = v.name.split(":")[0] 
# # embedding (50000, 128) -> (50000, 32) # if key == "seq2seq/embedding/embedding": # print (key) # print (value[key].shape) # d1 = value[key].shape[1] # a_op = v.assign(tf.convert_to_tensor(value[key][:,:d1//ratio])) # # kernel (384, 1024) -> (96, 256) # # w_reduce_c (512, 256) -> (128, 64) # elif key == "seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel" or \ # key == "seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel" or \ # key == "seq2seq/reduce_final_st/w_reduce_c" or \ # key == "seq2seq/reduce_final_st/w_reduce_h" or \ # key == "seq2seq/decoder/attention_decoder/Linear/Matrix" or \ # key == "seq2seq/decoder/attention_decoder/lstm_cell/kernel" or \ # key == "seq2seq/decoder/attention_decoder/Attention/Linear/Matrix" or \ # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Matrix": # print (key) # print (value[key].shape) # d0, d1 = value[key].shape[0], value[key].shape[1] # a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio, :d1//ratio])) # # bias (1024,) -> (256,) # elif key == "seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias" or \ # key == "seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias" or \ # key == "seq2seq/reduce_final_st/bias_reduce_c" or \ # key == "seq2seq/reduce_final_st/bias_reduce_h" or \ # key == "seq2seq/decoder/attention_decoder/lstm_cell/bias" or \ # key == "seq2seq/decoder/attention_decoder/v" or \ # key == "seq2seq/decoder/attention_decoder/Attention/Linear/Bias" or \ # key == "seq2seq/decoder/attention_decoder/Linear/Bias" or \ # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Bias": # print (key) # print (value[key].shape) # d0 = value[key].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio])) # # W_h (1, 1, 512, 512) -> (1, 1, 128, 128) # elif key == "seq2seq/decoder/attention_decoder/W_h": # print (key) # print (value[key].shape) # d2, d3 = value[key].shape[2], value[key].shape[3] # a_op = v.assign(tf.convert_to_tensor(value[key][:,:,:d2//ratio,:d3//ratio])) # # Matrix (1152, 1) -> (288, 1) # elif key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Matrix" or \ # key == "seq2seq/output_projection/w": # print (key) # print (value[key].shape) # d0 = value[key].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio,:])) # # Bias (1,) -> (1,) # elif key == "seq2seq/output_projection/v" or \ # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Bias": # print (key) # print (value[key].shape) # a_op = v.assign(tf.convert_to_tensor(value[key])) # # multi_attn # if hps.use_multi_attn.value: # if key == "seq2seq/decoder/attention_decoder/attn_0/v" or \ # key == "seq2seq/decoder/attention_decoder/attn_1/v": # # key == "seq2seq/decoder/attention_decoder/attn_2/v": # k = "seq2seq/decoder/attention_decoder/v" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # if key == "seq2seq/decoder/attention_decoder/Attention/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/Attention/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/Attention/Linear_2/Bias": # k = "seq2seq/decoder/attention_decoder/Attention/Linear/Bias" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # elif hps.use_multi_pgen.value: # if key == "seq2seq/decoder/attention_decoder/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/Linear_2/Bias": 
# k = "seq2seq/decoder/attention_decoder/Linear/Bias" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # if key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_2/Bias": # k = "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Bias" # print (key) # print (value[k].shape) # a_op = v.assign(tf.convert_to_tensor(value[k])) # elif hps.use_multi_pvocab.value: # if key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_2/Bias": # k = "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Bias" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # assign_op.append(a_op) # Add an op to initialize the variables. init_op = tf.global_variables_initializer() # Add ops to save and restore all the variables. saver = tf.train.Saver() with tf.Session(config=util.get_config()) as sess: sess.run(init_op) # Do some work with the model. for a_op in assign_op: a_op.op.run() for _ in range(0): batch = batcher.next_batch() results = model.run_train_step(sess, batch) # Save the variables to disk. if hps.use_multi_attn.value: ckpt_tag = "multi_attn_2_attn_proj" elif hps.use_multi_pgen.value: ckpt_tag = "multi_attn_2_pgen_proj" elif hps.use_multi_pvocab.value: ckpt_tag = "big_multi_attn_2_pvocab_proj" else: ckpt_tag = "pointer_proj" ckpt_to_save = '/home/cs224u/pointer/log/ckpt/' + ckpt_tag + '/model.ckpt-' + str( step) save_path = saver.save(sess, ckpt_to_save) print("Model saved in path: %s" % save_path) # ------------------------------------- else: setup_training(model, batcher, hps) elif hps.mode.value == 'eval': model = SummarizationModel(hps, vocab) run_eval(model, batcher, vocab) elif hps.mode.value == 'decode': decode_model_hps = hps # This will be the hyperparameters for the decoder model decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
all_beams.append(new_beam) beams = [] for h in self.sort_beams(all_beams): if h.latest_token == self.vocab.word2id(data.STOP_DECODING): if steps >= config.min_dec_steps: results.append(h) else: beams.append(h) if len(beams) == config.beam_size or len( results) == config.beam_size: break steps += 1 if len(results) == 0: results = beams beams_sorted = self.sort_beams(results) return beams_sorted[0] if __name__ == '__main__': article = "近日,一段消防员用叉子吃饭的视频在网上引起热议。原来是因为训练强度太大,半天下来,大家拿筷子的手一直在抖,甚至没法夹菜。于是,用叉子吃饭,渐渐成了上海黄浦消防车站中队饭桌上的传统。转发,向消防员致敬!" model_path = sys.argv[1] vocab = Vocab(config.vocab_path, config.vocab_size) batch = build_batch_by_article(article, vocab) beam_processor = BeamSearch(model_path, vocab) beam_processor.decode(batch)
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting running in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): os.makedirs(FLAGS.log_root) config = { 'kernel_sizes': [3, 4, 5], 'edim': FLAGS.emb_dim, 'n_words': FLAGS.vocab_size, 'std_dev': 0.05, 'sentence_len': FLAGS.max_enc_steps, 'n_filters': 100, 'batch_size': FLAGS.batch_size, 'trunc_norm_init_std': FLAGS.trunc_norm_init_std, 'rand_unif_init_mag': FLAGS.rand_unif_init_mag, 'max_grad_norm': FLAGS.max_grad_norm, 'num_classes': FLAGS.num_class } vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) hparam_list = [ 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'source_class', 'num_class', 'data_dir', 'train_file', 'test_file', 'neutral_file', 'neutral_file_filtering', 'max_epochs' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict) tf.set_random_seed(111) # Batcher: data preparation part train_batcher = TrainBatcher(hps_discriminator, vocab) test_batcher = TestBatcher(hps_discriminator, vocab) # initialize the CNN class cnn_classifier = CNN(config) # initialize the CNN model (TF-level) sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier( cnn_classifier) run_train_cnn_classifier(model=cnn_classifier, train_batcher=train_batcher, test_batcher=test_batcher, max_run_epoch=hps_discriminator.max_epochs, sess=sess_cnn_cls, saver=saver_cnn_cls, train_dir=train_dir_cnn_cls) neutral_batcher = NeutralBatcher(hps_discriminator, vocab) filter_neutral_data(batcher=neutral_batcher, model=cnn_classifier, sess=sess_cnn_cls, hps=hps_discriminator)
def run(args, local_rank): """ Distributed Synchronous """ torch.manual_seed(1234) vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[]) if (args.world_size == 1 or dist.get_rank() == 0): print (vocab.size) model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim, args.num_heads, args.dropout, args.layers, args.approx) if args.start_from is not None: ckpt = torch.load(args.start_from, map_location='cpu') model.load_state_dict(ckpt['model']) model = model.cuda(local_rank) weight_decay_params = [] no_weight_decay_params = [] for name, param in model.named_parameters(): if name.endswith('bias') or 'layer_norm' in name: no_weight_decay_params.append(param) else: weight_decay_params.append(param) grouped_params = [{'params':weight_decay_params, 'weight_decay':0.01}, {'params':no_weight_decay_params, 'weight_decay':0.}] if args.world_size > 1: torch.manual_seed(1234 + dist.get_rank()) random.seed(5678 + dist.get_rank()) if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") optimizer = FusedAdam(grouped_params, lr=args.lr, betas=(0.9, 0.999), eps =1e-6, bias_correction=False, max_grad_norm=1.0) optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = AdamWeightDecayOptimizer(grouped_params, lr=args.lr, betas=(0.9, 0.999), eps=1e-6) if args.start_from is not None: optimizer.load_state_dict(ckpt['optimizer']) train_data = DataLoader(vocab, args.train_data, args.batch_size, args.max_len) batch_acm = 0 acc_acm, ntokens_acm, npairs_acm, loss_acm = 0., 0., 0., 0. while True: model.train() for truth, inp, msk in train_data: batch_acm += 1 if batch_acm <= args.warmup_steps: update_lr(optimizer, args.lr*batch_acm/args.warmup_steps) truth = truth.cuda(local_rank) inp = inp.cuda(local_rank) msk = msk.cuda(local_rank) optimizer.zero_grad() res, loss, acc, ntokens, npairs = model(truth, inp, msk) loss_acm += loss.item() acc_acm += acc ntokens_acm += ntokens npairs_acm += npairs if args.fp16: optimizer.backward(loss) else: loss.backward() if args.world_size > 1: average_gradients(model) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() if (args.world_size==1 or dist.get_rank() ==0) and batch_acm%args.print_every == -1%args.print_every: print ('batch_acm %d, loss %.3f, acc %.3f, x_acm %d'%(batch_acm, loss_acm/args.print_every, acc_acm/ntokens_acm, npairs_acm)) acc_acm, ntokens_acm, loss_acm = 0., 0., 0. if (args.world_size==1 or dist.get_rank() ==0) and batch_acm%args.save_every == -1%args.save_every: if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) torch.save({'args':args, 'model':model.state_dict(), 'optimizer':optimizer.state_dict()}, '%s/epoch%d_batch_%d'%(args.save_dir, train_data.epoch_id, batch_acm))
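The warmup branch above ramps the learning rate linearly: during the first warmup_steps batches the rate is args.lr * batch_acm / warmup_steps, and afterwards it stays at args.lr. The update_lr called there is assumed to set an absolute rate (unlike the multiplicative helper in the ranker script earlier); a minimal sketch under that assumption:

def update_lr(optimizer, lr):
    # Assumed shape of the helper called above: set an absolute learning rate.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def warmup_lr(base_lr, batch_acm, warmup_steps):
    # Linear warmup: e.g. base_lr=1e-4, warmup_steps=10000, batch 2500 -> 2.5e-5.
    if batch_acm <= warmup_steps:
        return base_lr * batch_acm / warmup_steps
    return base_lr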
def main(unused_argv): tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting running in %s mode...', (FLAGS.mode)) # build the vocabulary vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_sen_num', 'max_dec_steps', 'max_enc_steps' ] hps_dict = {} for key, val in FLAGS.__flags.items(): if key in hparam_list: hps_dict[key] = val.value # add it to the dict hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict) hparam_list = [ 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_enc_sen_num', 'max_enc_seq_len' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: hps_dict[key] = val.value # add it to the dict hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict) # # take out the data for the minimum batch size batcher = GenBatcher(vocab, hps_generator) # print(batcher.train_batch[0].original_review_inputs) # print(len(batcher.train_batch[0].original_review_inputs)) tf.set_random_seed(123) if FLAGS.mode == 'train_generator': # print("Start pre-training ......") ge_model = Generator(hps_generator, vocab) sess_ge, saver_ge, train_dir_ge = setup_training_generator(ge_model) generated = Generated_sample(ge_model, vocab, batcher, sess_ge) print("Start pre-training generator......") # run_pre_train_generator(ge_model, batcher, 1000, sess_ge, saver_ge, train_dir_ge) util.load_ckpt(saver_ge, sess_ge, ckpt_dir="train-generator") print("finish load train-generator") print("Generating negative examples......") generated.generator_train_negative_example() generated.generator_test_negative_example() print("finish write") elif FLAGS.mode == 'train_discriminator': # print("Start pre-training ......") model_dis = Discriminator(hps_discriminator, vocab) dis_batcher = DisBatcher(hps_discriminator, vocab, "discriminator_train/positive/*", "discriminator_train/negative/*", "discriminator_test/positive/*", "discriminator_test/negative/*") sess_dis, saver_dis, train_dir_dis = setup_training_discriminator( model_dis) print("Start pre-training discriminator......") if not os.path.exists("discriminator_result"): os.mkdir("discriminator_result") run_pre_train_discriminator(model_dis, dis_batcher, 1000, sess_dis, saver_dis, train_dir_dis) elif FLAGS.mode == "adversarial_train": generator_graph = tf.Graph() discriminatorr_graph = tf.Graph() print("Start adversarial-training......") # tf.reset_default_graph() with generator_graph.as_default(): model = Generator(hps_generator, vocab) sess_ge, saver_ge, train_dir_ge = setup_training_generator(model) generated = Generated_sample(model, vocab, batcher, sess_ge) util.load_ckpt(saver_ge, sess_ge, ckpt_dir="train-generator") print("finish load train-generator") with discriminatorr_graph.as_default(): model_dis = Discriminator(hps_discriminator, vocab) dis_batcher = DisBatcher(hps_discriminator, vocab, "discriminator_train/positive/*", "discriminator_train/negative/*", "discriminator_test/positive/*", "discriminator_test/negative/*") sess_dis, saver_dis, train_dir_dis = setup_training_discriminator( model_dis) util.load_ckpt(saver_dis, sess_dis, ckpt_dir="train-discriminator") print("finish load train-discriminator") print("Start adversarial training......") if not os.path.exists("train_sample_generated"): os.mkdir("train_sample_generated") if not 
os.path.exists("test_max_generated"): os.mkdir("test_max_generated") if not os.path.exists("test_sample_generated"): os.mkdir("test_sample_generated") # whole_decay = False # for epoch in range(100): # print('開始訓練') # batches = batcher.get_batches(mode = 'train') # for step in range(int(len(batches)/20)): # run_train_generator(model, model_dis, sess_dis, batcher, dis_batcher, batches[step*20:(step+1)*20], sess_ge, saver_ge, train_dir_ge) # generated.generator_sample_example( # "train_sample_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_positive", # "train_sample_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_negative", 20) # tf.logging.info("test performance: ") # tf.logging.info("epoch: "+str(epoch)+" step: "+str(step)) # print("evaluate the diversity of DP-GAN (decode based on max probability)") # generated.generator_test_sample_example( # "test_sample_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_positive", # "test_sample_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_negative", 20) # print("evaluate the diversity of DP-GAN (decode based on sampling)") # generated.generator_test_max_example( # "test_max_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_positive", # "test_max_generated/" + str(epoch) + "epoch_step" + str(step) + "_temp_negative", 20) # dis_batcher.train_queue = [] # dis_batcher.train_queue = [] # for i in range(epoch + 1): # for j in range(step + 1): # dis_batcher.train_queue += dis_batcher.fill_example_queue("train_sample_generated/"+str(i)+"epoch_step"+str(j)+"_temp_positive/*") # dis_batcher.train_queue += dis_batcher.fill_example_queue("train_sample_generated/"+str(i)+"epoch_step"+str(j)+"_temp_negative/*") # dis_batcher.train_batch = dis_batcher.create_batches(mode="train", shuffleis=True) # whole_decay = run_train_discriminator(model_dis, 5, dis_batcher, dis_batcher.get_batches(mode="train"), sess_dis, saver_dis, train_dir_dis, whole_decay) elif FLAGS.mode == "test_language_model": ge_model = Generator(hps_generator, vocab) sess_ge, saver_ge, train_dir_ge = setup_training_generator(ge_model) # saver_ge.restore(sess_ge, "train-generator/model-31200") util.load_ckpt(saver_ge, sess_ge, ckpt_dir="train-generator") print("finish load train-generator") jieba.load_userdict('dir.txt') inputs = '' while inputs != "close": inputs = input("Enter your ask: ") sentence = jieba.cut(inputs) sentence = (" ".join(sentence)) print(sentence) sentence = sentence.split() enc_input = [vocab.word2id(w) for w in sentence] enc_lens = np.array([len(enc_input)]) enc_input = np.array([enc_input]) out_sentence = ('[START]').split() dec_batch = [vocab.word2id(w) for w in out_sentence] #dec_batch = [2] + dec_batch #dec_batch.append(3) while len(dec_batch) < 40: dec_batch.append(1) dec_batch = np.array([dec_batch]) dec_batch = np.resize(dec_batch, (1, 1, 40)) dec_lens = np.array([len(dec_batch)]) result = ge_model.run_test_language_model(sess_ge, enc_input, enc_lens, dec_batch, dec_lens) output_ids = [int(t) for t in result['generated'][0][0]][1:] decoded_words = data.outputids2words(output_ids, vocab, None) print("decoded_words :", decoded_words) try: if decoded_words[0] == '[STOPDOC]': decoded_words = decoded_words[1:] fst_stop_idx = decoded_words.index( data.STOP_DECODING_DOCUMENT ) # index of the (first) [STOP] symbol decoded_words = decoded_words[:fst_stop_idx] except ValueError: decoded_words = decoded_words if decoded_words[-1] != '.' and decoded_words[ -1] != '!' 
and decoded_words[-1] != '?': decoded_words.append('.') decoded_words_all = [] decoded_output = ' '.join(decoded_words).strip() # single string decoded_words_all.append(decoded_output) decoded_words_all = ' '.join(decoded_words_all).strip() decoded_words_all = decoded_words_all.replace("[UNK] ", "") decoded_words_all = decoded_words_all.replace("[UNK]", "") decoded_words_all = decoded_words_all.replace(" ", "") decoded_words_all, _ = re.subn(r"(! ){2,}", "", decoded_words_all) decoded_words_all, _ = re.subn(r"(\. ){2,}", "", decoded_words_all) if decoded_words_all.startswith(','): decoded_words_all = decoded_words_all[1:] print("The response: {}".format(decoded_words_all))
def reset_parameters(self): for layer in self.layers: nn.init.normal_(layer.weight, std=0.02) nn.init.constant_(layer.bias[self.input_dim:], 1) nn.init.constant_(layer.bias[:self.input_dim], 0) def forward(self, x): for layer in self.layers: new_x = layer(x) new_x, gate = new_x.chunk(2, dim=-1) new_x = F.relu(new_x) gate = torch.sigmoid(gate) x = gate * x + (1 - gate) * new_x return x if __name__ == "__main__": from data import Vocab, CLS, DUM, END vocab = Vocab('../data/AMR/amr_1.0_reca/lem_vocab', 3, [CLS]) embed = AMREmbedding(vocab, 300, pretrained_file='../data/glove.840B.300d.txt', dump_file='../data/AMR/amr_1.0_reca/glove_lem_embed') vocab = Vocab('../data/AMR/amr_1.0_reca/concept_vocab', 3, [DUM, END]) embed = AMREmbedding( vocab, 300, pretrained_file='../data/glove.840B.300d.txt', amr=True, dump_file='../data/AMR/amr_1.0_reca/glove_concept_embed')
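The class around reset_parameters/forward above is not shown; what follows is a self-contained sketch of a gated (highway-style) stack consistent with those methods. The name Highway and the constructor signature are assumptions. Each layer maps input_dim to 2*input_dim, split into a candidate and a gate, and the gate bias of 1 makes the block start close to an identity mapping.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Highway(nn.Module):
    # Hypothetical container for the reset_parameters/forward methods above.
    def __init__(self, input_dim, layers):
        super(Highway, self).__init__()
        self.input_dim = input_dim
        self.layers = nn.ModuleList(
            [nn.Linear(input_dim, 2 * input_dim) for _ in range(layers)])
        self.reset_parameters()

    def reset_parameters(self):
        for layer in self.layers:
            nn.init.normal_(layer.weight, std=0.02)
            # The second half of the bias feeds the gate; sigmoid(1) ~ 0.73, so
            # the output initially stays close to the input (carry behaviour).
            nn.init.constant_(layer.bias[self.input_dim:], 1)
            nn.init.constant_(layer.bias[:self.input_dim], 0)

    def forward(self, x):
        for layer in self.layers:
            new_x, gate = layer(x).chunk(2, dim=-1)
            gate = torch.sigmoid(gate)
            x = gate * x + (1 - gate) * F.relu(new_x)
        return x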
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting running in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): os.makedirs(FLAGS.log_root) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict) hparam_list = [ 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict) tf.set_random_seed( 111 ) # a seed value for randomness # train-classification train-sentiment train-cnn-classificatin train-generator if FLAGS.mode == "train-classifier": #print("Start pre-training......") model_class = Classification(hps_discriminator, vocab) cla_batcher = ClaBatcher(hps_discriminator, vocab) sess_cls, saver_cls, train_dir_cls = setup_training_classification( model_class) print("Start pre-training classification......") run_pre_train_classification(model_class, cla_batcher, 1, sess_cls, saver_cls, train_dir_cls) #10 generated = Generate_training_sample(model_class, vocab, cla_batcher, sess_cls) print("Generating training examples......") generated.generate_training_example("train") generated.generate_test_example("test") elif FLAGS.mode == "train-sentimentor": model_class = Classification(hps_discriminator, vocab) cla_batcher = ClaBatcher(hps_discriminator, vocab) sess_cls, saver_cls, train_dir_cls = setup_training_classification( model_class) print("Start pre_train_sentimentor......") model_sentiment = Sentimentor(hps_generator, vocab) sentiment_batcher = SenBatcher(hps_generator, vocab) sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor( model_sentiment) util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification") run_pre_train_sentimentor(model_sentiment, sentiment_batcher, 1, sess_sen, saver_sen, train_dir_sen) #1 elif FLAGS.mode == "test": config = { 'n_epochs': 5, 'kernel_sizes': [3, 4, 5], 'dropout_rate': 0.5, 'val_split': 0.4, 'edim': 300, 'n_words': None, # Leave as none 'std_dev': 0.05, 'sentence_len': 50, 'n_filters': 100, 'batch_size': 50 } config['n_words'] = 50000 cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab) cnn_classifier = CNN(config) sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier( cnn_classifier) #util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification") run_train_cnn_classifier(cnn_classifier, cla_cnn_batcher, 1, sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls) #1 files = os.listdir("test-generate-transfer/") for file_ 
in files: run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls, "test-generate-transfer/" + file_ + "/*") #elif FLAGS.mode == "test": elif FLAGS.mode == "train-generator": model_class = Classification(hps_discriminator, vocab) cla_batcher = ClaBatcher(hps_discriminator, vocab) sess_cls, saver_cls, train_dir_cls = setup_training_classification( model_class) model_sentiment = Sentimentor(hps_generator, vocab) sentiment_batcher = SenBatcher(hps_generator, vocab) sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor( model_sentiment) config = { 'n_epochs': 5, 'kernel_sizes': [3, 4, 5], 'dropout_rate': 0.5, 'val_split': 0.4, 'edim': 300, 'n_words': None, # Leave as none 'std_dev': 0.05, 'sentence_len': 50, 'n_filters': 100, 'batch_size': 50 } config['n_words'] = 50000 cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab) cnn_classifier = CNN(config) sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier( cnn_classifier) model = Generator(hps_generator, vocab) batcher = GenBatcher(vocab, hps_generator) sess_ge, saver_ge, train_dir_ge = setup_training_generator(model) #util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification") util.load_ckpt(saver_sen, sess_sen, ckpt_dir="train-sentimentor") generated = Generated_sample(model, vocab, batcher, sess_ge) print("Start pre-training generator......") run_pre_train_generator(model, batcher, 1, sess_ge, saver_ge, train_dir_ge, generated, cla_cnn_batcher, cnn_classifier, sess_cnn_cls) # 4 generated.generate_test_negetive_example( "temp_negetive", batcher) # batcher, model_class, sess_cls, cla_batcher generated.generate_test_positive_example("temp_positive", batcher) #run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls, # "temp_negetive" + "/*") loss_window = 0 t0 = time.time() print("begin reinforcement learning:") for epoch in range(30): batches = batcher.get_batches(mode='train') for i in range(len(batches)): current_batch = copy.deepcopy(batches[i]) sentiment_batch = batch_sentiment_batch( current_batch, sentiment_batcher) result = model_sentiment.max_generator(sess_sen, sentiment_batch) weight = result['generated'] current_batch.weight = weight sentiment_batch.weight = weight cla_batch = batch_classification_batch(current_batch, batcher, cla_batcher) result = model_class.run_ypred_auc(sess_cls, cla_batch) cc = SmoothingFunction() reward_sentiment = 1 - np.abs(0.5 - result['y_pred_auc']) reward_BLEU = [] for k in range(FLAGS.batch_size): reward_BLEU.append( sentence_bleu( [current_batch.original_reviews[k].split()], cla_batch.original_reviews[k].split(), smoothing_function=cc.method1)) reward_BLEU = np.array(reward_BLEU) reward_de = (2 / (1.0 / (1e-6 + reward_sentiment) + 1.0 / (1e-6 + reward_BLEU))) result = model.run_train_step(sess_ge, current_batch) train_step = result[ 'global_step'] # we need this to update our running average loss loss = result['loss'] loss_window += loss if train_step % 100 == 0: t1 = time.time() tf.logging.info( 'seconds for %d training generator step: %.3f ', train_step, (t1 - t0) / 100) t0 = time.time() tf.logging.info('loss: %f', loss_window / 100) # print the loss to screen loss_window = 0.0 if train_step % 10000 == 0: generated.generate_test_negetive_example( "test-generate-transfer/" + str(epoch) + "epoch_step" + str(train_step) + "_temp_positive", batcher) generated.generate_test_positive_example( "test-generate/" + str(epoch) + "epoch_step" + str(train_step) + "_temp_positive", batcher) #saver_ge.save(sess, train_dir + "/model", 
global_step=train_step) #run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls, # "test-generate-transfer/" + str(epoch) + "epoch_step" + str( # train_step) + "_temp_positive" + "/*") cla_batch, bleu = output_to_classification_batch( result['generated'], current_batch, batcher, cla_batcher, cc) result = model_class.run_ypred_auc(sess_cls, cla_batch) reward_result_sentiment = result['y_pred_auc'] reward_result_bleu = np.array(bleu) reward_result = (2 / (1.0 / (1e-6 + reward_result_sentiment) + 1.0 / (1e-6 + reward_result_bleu))) current_batch.score = 1 - current_batch.score result = model.max_generator(sess_ge, current_batch) cla_batch, bleu = output_to_classification_batch( result['generated'], current_batch, batcher, cla_batcher, cc) result = model_class.run_ypred_auc(sess_cls, cla_batch) reward_result_transfer_sentiment = result['y_pred_auc'] reward_result_transfer_bleu = np.array(bleu) reward_result_transfer = ( 2 / (1.0 / (1e-6 + reward_result_transfer_sentiment) + 1.0 / (1e-6 + reward_result_transfer_bleu))) #tf.logging.info("reward_nonsentiment: "+str(reward_sentiment) +" output_original_sentiment: "+str(reward_result_sentiment)+" output_original_bleu: "+str(reward_result_bleu)) reward = reward_result_transfer #reward_de + reward_result_sentiment + #tf.logging.info("reward_de: "+str(reward_de)) model_sentiment.run_train_step(sess_sen, sentiment_batch, reward)
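The reward used above is a harmonic mean of the sentiment (style) score and the BLEU (content-preservation) score, so it is high only when both components are high; the 1e-6 terms guard against division by zero. A small worked example:

import numpy as np

reward_sentiment = np.array([0.9, 0.9, 0.1])
reward_bleu      = np.array([0.8, 0.1, 0.1])
reward = 2 / (1.0 / (1e-6 + reward_sentiment) + 1.0 / (1e-6 + reward_bleu))
print(reward)  # approx. [0.847, 0.180, 0.100]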
def evaluate_model(self, eval_file, gpus): self.device = torch.device( "cuda:" + str(gpus) if torch.cuda.is_available() else "cpu") print('device', self.device) test_models = [] if os.path.isdir(eval_file): for file in os.listdir(eval_file): fname = os.path.join(eval_file, file) if os.path.isfile(fname): test_models.append(fname) model_args = torch.load(fname, map_location=self.device)['args'] else: test_models.append(eval_file) model_args = torch.load(eval_file, map_location=self.device)['args'] from data import Vocab, DataLoader, STR, END, CLS, SEL, TL, rCLS model_args = collections.namedtuple("HParams", sorted( model_args.keys()))(**model_args) vocabs = dict() vocabs['concept'] = Vocab(model_args.concept_vocab, 5, [CLS]) vocabs['token'] = Vocab(model_args.token_vocab, 5, [STR, END]) vocabs['token_char'] = Vocab(model_args.token_char_vocab, 100, [STR, END]) vocabs['concept_char'] = Vocab(model_args.concept_char_vocab, 100, [STR, END]) vocabs['relation'] = Vocab(model_args.relation_vocab, 5, [CLS, rCLS, SEL, TL]) lexical_mapping = LexicalMap() if self.args.encoder_: vocabs, lexical_mapping = self._prepare_data() config_class, model_class, tokenizer_class = MODEL_CLASSES[ self.args.encoder_type] bert_config = config_class.from_pretrained(self.args.lm_model, ) bert_tokenizer = tokenizer_class.from_pretrained( self.args.lm_model) bert_model = model_class.from_pretrained( self.args.lm_model, from_tf=bool(".ckpt" in self.args.lm_model), config=self.args.lm_model, ).to(self.device) eval_model = Reasoning_AMR_CN_DUAL( vocabs, model_args.concept_char_dim, model_args.concept_dim, model_args.cnn_filters, model_args.char2concept_dim, model_args.rel_dim, model_args.rnn_hidden_size, model_args.rnn_num_layers, model_args.embed_dim, model_args.bert_embed_dim, model_args.ff_embed_dim, model_args.num_heads, model_args.dropout, model_args.snt_layer, model_args.graph_layers, model_args.pretrained_file, self.device, model_args.batch_size, model_args.lm_model, bert_config, bert_model, bert_tokenizer, model_args.bert_max_length, model_args.n_answers, model_args.encoder_type, model_args.gcn_concept_dim, model_args.gcn_hidden_dim, model_args.gcn_output_dim, model_args.max_conceptnet_length, model_args.conceptnet_path, ) else: eval_model = '' test_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.test_data, model_args.batch_size, for_train='Eval') answer_tempelate = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'} # Evaluate! 
logger.info("***** Running Evaluating *****") logger.info(" Task: %s", self.args.task) logger.info(" Num examples = %d", len(test_data)) logger.info(" Running Language Model = %s", model_args.lm_model) logger.info(" Running Model = %s", model_args.encoder_type) logger.info(" Running File = %s", eval_file) logger.info(" Test data = %s", self.args.test_data) for test_model in test_models: eval_model.load_state_dict( torch.load(test_model, map_location=self.device)['model']) eval_model = eval_model.cuda(self.device) eval_model.eval() running_corrects = 0 eval_loss_sum, batch_acm = 0, 0 with open(test_model + model_args.prefix + '.csv', 'w', newline='') as csvfile: csvwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) for batch in test_data: batch = move_to_cuda(batch, self.device) eval_logits, eval_labels, ans_ids, = eval_model( batch, train=False) eval_logits_forpred = eval_logits.clone().detach() pred_values, pred_indices = torch.max( eval_logits_forpred, 1) eval_labels = eval_labels.tolist() eval_pred = pred_indices.tolist() corrects = [ i for i, j in zip(eval_labels, eval_pred) if i == j ] batch_acm += 1 # Statistics running_corrects += len(corrects) for i, pred in enumerate(eval_pred): csvwriter.writerow([ ans_ids[i], answer_tempelate[int(pred_indices[i])] ]) print('Overall accuracy: ', (running_corrects / len(test_data)))
class Seq2Seq(object): def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99): """Calculate the running average loss via exponential decay. This is used to implement early stopping with respect to a smoother loss curve than the raw loss curve. Args: loss: loss on the most recent eval step running_avg_loss: running_avg_loss so far step: training iteration step decay: rate of exponential decay, a float between 0 and 1. Larger is smoother. Returns: running_avg_loss: new running average loss """ if running_avg_loss == 0: # on the first iteration just take the loss running_avg_loss = loss else: running_avg_loss = running_avg_loss * decay + (1 - decay) * loss running_avg_loss = min(running_avg_loss, 12) # clip loss_sum = tf.Summary() tag_name = 'running_avg_loss/decay=%f' % (decay) loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss) self.summary_writer.add_summary(loss_sum, step) tf.logging.info('running_avg_loss: %f', running_avg_loss) return running_avg_loss
def restore_best_model(self): """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory""" tf.logging.info("Restoring bestmodel for training...") # Initialize all vars in the model sess = tf.Session(config=util.get_config()) print("Initializing all variables...") sess.run(tf.initialize_all_variables()) # Restore the best model from eval dir saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name]) print("Restoring all non-adagrad variables from best model in eval dir...") curr_ckpt = util.load_ckpt(saver, sess, "eval") print("Restored %s." % curr_ckpt) # Save this model to train dir and quit new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model") new_fname = os.path.join(FLAGS.log_root, "train", new_model_name) print("Saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables new_saver.save(sess, new_fname) print("Saved.") exit()
def restore_best_eval_model(self): # load best evaluation loss so far best_loss = None best_step = None # goes through all event files and selects the best loss achieved and returns it event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root))) for ef in event_files: try: for e in tf.train.summary_iterator(ef): for v in e.summary.value: step = e.step if 'running_avg_loss/decay' in v.tag: running_avg_loss = v.simple_value if best_loss is None or running_avg_loss < best_loss: best_loss = running_avg_loss best_step = step except: continue tf.logging.info('restoring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step)) return best_loss
def convert_to_coverage_model(self): """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint""" tf.logging.info("converting non-coverage model to coverage model..") # initialize an entire coverage model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-coverage weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name]) print("restoring non-coverage variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_cov_init' print("saving model to %s..."
% (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def convert_to_reinforce_model(self): """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint""" tf.logging.info("converting non-reinforce model to reinforce model..") # initialize an entire reinforce model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-reinforce weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name]) print("restoring non-reinforce variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_rl_init' print("saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. 
if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop()
def run_training(self): """Repeatedly runs training iterations, logging loss to screen and writing summaries""" tf.logging.info("Starting run_training") if FLAGS.debug: # start the tensorflow debugger self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self.train_step = 0 if FLAGS.ac_training: # DDQN training is done asynchronously along with model training tf.logging.info('Starting DQN training thread...') self.dqn_train_step = 0 self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() watcher = Thread(target=self.watch_threads) watcher.daemon = True watcher.start() # starting the main thread tf.logging.info('Starting Seq2Seq training...') while True: # repeats until interrupted batch = self.batcher.next_batch() t0=time.time() if FLAGS.ac_training: # For DDQN, we first collect the model output to calculate the reward and Q-estimates # Then we fix the estimation either using our target network or using the true Q-values # This process will usually take time and we are working on improving it. transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q-values collection time: {}'.format(time.time()-t0)) # whenever we are working with the DDQN, we switch to using the DDQN graph rather than the default graph with self.dqn_graph.as_default(): batch_len = len(transitions) # we use current decoder state to predict q_estimates, use_state_prime = False b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs) # we also get the next decoder state to correct the estimation, use_state_prime = True b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) # use current DQN to estimate values from current decoder state dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] #dqn_q_estimate_loss = dqn_results['loss'] # use target DQN to estimate values for the next decoder state dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov # we use the q_estimate of UNK token for all the OOV tokens q_estimates = np.concatenate([q_estimates, np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1) # modify Q-estimates using the result collected from current and target DQN.
# check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] # use scheduled sampling to decide whether to use true Q-values or the DDQN estimation if FLAGS.dqn_scheduled_sampling: q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DDQN based on true Q-values, # we need to update Q-values in our transitions based on the q_estimates we collected from the current DQN network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) # Once we are done with modifying Q-values, we can use them to train the DDQN model. # In this paper, we use a priority experience buffer which always selects states with higher quality # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer. # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training # are full, the DDQN will start the training. self.replay_buffer.add(transitions) # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for # DDQN pre-training if FLAGS.dqn_pretrain: tf.logging.info('RUNNING DQN PRETRAIN: Adding data to replay buffer only...') continue # if not, use the q_estimation to update the loss. results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates) else: results = self.model.run_train_steps(self.sess, batch, self.train_step) t1=time.time() # get the summaries and iteration number so we can write summaries to tensorboard summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer self.train_step = results['global_step'] # we need this to update our running average loss tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0)) printer_helper = {} printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) tf.logging.info('-------------------------------------------') self.summary_writer.add_summary(summaries, self.train_step) # write the summaries if self.train_step % 100 == 0: # flush the summary writer every so often self.summary_writer.flush() if FLAGS.ac_training: self.dqn_summary_writer.flush() if self.train_step > FLAGS.max_iter: break
def dqn_training(self): """ training the DDQN network.""" try: while True: if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit() _t = time.time() self.avg_dqn_loss = [] avg_dqn_target_loss = [] # Get a batch of size dqn_batch_size from replay buffer to train the model dqn_batch = self.replay_buffer.next_batch() if dqn_batch is None: tf.logging.info('replay buffer not loaded enough yet...') time.sleep(60) continue # Run train step for Current DQN model and collect the results dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch) # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True) self.dqn_train_step = dqn_results['global_step'] self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries self.avg_dqn_loss.append(dqn_results['loss']) avg_dqn_target_loss.append(dqn_target_results['loss']) self.dqn_train_step = self.dqn_train_step + 1 tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t)) # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL with self.dqn_graph.as_default(): current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss))) tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss))) # sleeping is required if you want the keyboard interruption to work time.sleep(FLAGS.dqn_sleep_time) except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() self.dqn_sv.stop()
def watch_threads(self): """Watch example queue and batch queue threads and restart if dead.""" while True: time.sleep(60) if not self.thrd_dqn_training.is_alive(): # if the thread is dead tf.logging.error('Found DQN Learning thread dead. Restarting.') self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start()
def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries.
Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved self.summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, 
FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on the q_estimates we collected from the current DQN network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] self.summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: self.summary_writer.flush() #time.sleep(600) # run eval every 10 minute
def main(self, unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary flags = getattr(FLAGS,"__flags") if not os.path.exists(FLAGS.log_root): if FLAGS.mode=="train": os.makedirs(FLAGS.log_root) else: raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) fw = open('{}/config.txt'.format(FLAGS.log_root), 'w') for k, v in flags.items(): fw.write('{}\t{}\n'.format(k, v)) fw.close() self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'gpu_num', #'sampled_greedy_flag', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q', 'enc_hidden_dim', 'dec_hidden_dim', 'k', 'scheduled_sampling', 'sampling_probability','fixed_sampling_probability', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict if FLAGS.ac_training: hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # creating all the required parameters for DDQN model.
if FLAGS.ac_training: hparam_list = ['lr', 'dqn_gpu_num', 'dqn_layers', 'dqn_replay_buffer_size', 'dqn_batch_size', 'dqn_target_update', 'dueling_net', 'dqn_polyak_averaging', 'dqn_sleep_time', 'dqn_scheduled_sampling', 'max_grad_norm'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) hps_dict.update({'vocab_size':self.vocab.size()}) self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after) tf.set_random_seed(111) # a seed value for randomness if self.hps.mode == 'train': print("creating model...") self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: # current DQN with parameters \Psi self.dqn = DQN(self.dqn_hps,'current') # target DQN with parameters \Psi^{\prime} self.dqn_target = DQN(self.dqn_hps,'target') self.setup_training() elif self.hps.mode == 'eval': self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: self.dqn = DQN(self.dqn_hps,'current') self.dqn_target = DQN(self.dqn_hps,'target') self.run_eval() elif self.hps.mode == 'decode': decode_model_hps = self.hps # This will be the hyperparameters for the decoder model decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, self.vocab) if FLAGS.ac_training: # We need our target DDQN network for collecting Q-estimation at each decoder step. dqn_target = DQN(self.dqn_hps,'target') else: dqn_target = None decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target) decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
# Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper def scheduled_sampling(self, batch_size, sampling_probability, true, estimate): with variable_scope.variable_scope("ScheduledEmbedding"): # Return -1s where we do not sample, and sample_ids elsewhere select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool) select_sample = select_sampler.sample(sample_shape=batch_size) sample_ids = array_ops.where( select_sample, tf.range(batch_size), gen_array_ops.fill([batch_size], -1)) where_sampling = math_ops.cast( array_ops.where(sample_ids > -1), tf.int32) where_not_sampling = math_ops.cast( array_ops.where(sample_ids <= -1), tf.int32) _estimate = array_ops.gather_nd(estimate, where_sampling) _true = array_ops.gather_nd(true, where_not_sampling) base_shape = array_ops.shape(true) result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape) result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape) result = result1 + result2 return result
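# The run_training/run_eval loops above correct each transition's Q-estimate with a double-DQN style target
# (reward alone at terminal steps, otherwise reward + gamma * target-net value at the current net's argmax
# action) and can then mix those estimates with true Q-values through the scheduled_sampling helper. A rough
# NumPy sketch of both steps with simplified stand-in data structures; none of these names come from the
# original code, and the scatter/gather bookkeeping of the TF version is deliberately skipped.
import numpy as np
from collections import namedtuple

Transition = namedtuple("Transition", "action reward done")

def correct_q_estimates(q_estimates, transitions, q_target_next, best_next_action, gamma):
    """Double-DQN style fix-up: for the taken action, replace the estimate with
    reward (terminal) or reward + gamma * Q_target(s', argmax_a Q_current(s', a))."""
    q = q_estimates.copy()
    for i, tr in enumerate(transitions):
        if tr.done:
            q[i, tr.action] = tr.reward
        else:
            q[i, tr.action] = tr.reward + gamma * q_target_next[i, best_next_action[i]]
    return q

def mix_with_true_q(sampling_probability, true_q, estimated_q, rng=None):
    """Scheduled sampling: per row, keep the DDQN estimate with the given probability,
    otherwise fall back to the true Q-values."""
    rng = rng or np.random.default_rng()
    mask = rng.random(true_q.shape[0]) < sampling_probability   # Bernoulli draw per row
    return np.where(mask[:, None], estimated_q, true_q)

# toy usage: 2 transitions over a 3-action space
transitions = [Transition(action=0, reward=1.0, done=True),
               Transition(action=2, reward=0.5, done=False)]
q_est = np.zeros((2, 3))
q_tgt_next = np.array([[0.0, 0.0, 0.0], [0.2, 0.8, 0.1]])
best_next = np.array([0, 1])
fixed = correct_q_estimates(q_est, transitions, q_tgt_next, best_next, gamma=0.99)
print(mix_with_true_q(0.5, np.ones((2, 3)), fixed, rng=np.random.default_rng(0)))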
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) extractor = 'bert' if FLAGS.use_bert else 'lambdamart' pretrained_dataset = FLAGS.dataset_name if FLAGS.dataset_name == 'duc_2004': pretrained_dataset = 'cnn_dm' if FLAGS.singles_and_pairs == 'both': FLAGS.exp_name = FLAGS.dataset_name + '_' + FLAGS.exp_name + extractor + '_both' FLAGS.pretrained_path = os.path.join(FLAGS.log_root, pretrained_dataset + '_both') dataset_articles = FLAGS.dataset_name else: FLAGS.exp_name = FLAGS.dataset_name + '_' + FLAGS.exp_name + extractor + '_singles' FLAGS.pretrained_path = os.path.join(FLAGS.log_root, pretrained_dataset + '_singles') dataset_articles = FLAGS.dataset_name + '_singles' if FLAGS.upper_bound: FLAGS.exp_name = FLAGS.exp_name + '_upperbound' ssi_list = None # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth) else: my_log_dir = os.path.join( log_dir, '%s_%s_%s' % (FLAGS.dataset_name, extractor, FLAGS.singles_and_pairs)) with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f: ssi_list = pickle.load(f) print('Running statistics on %s' % FLAGS.exp_name) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists(os.path.join( FLAGS.data_root, FLAGS.dataset_name)) or len( os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: raise Exception('No TF example data found at %s.' % os.path.join(FLAGS.data_root, FLAGS.dataset_name)) logging.set_verbosity( logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) print(util.bcolors.OKGREEN + "Experiment path: " + FLAGS.log_root + util.bcolors.ENDC) if FLAGS.dataset_name == 'duc_2004': vocab = Vocab(FLAGS.vocab_path + '_' + 'cnn_dm', FLAGS.vocab_size) # create a vocabulary else: vocab = Vocab(FLAGS.vocab_path + '_' + FLAGS.dataset_name, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ item for item in list(FLAGS.flag_values_dict().keys()) if item != '?' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", list(hps_dict.keys()))(**hps_dict) tf.set_random_seed(113) # a seed value for randomness decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) start_time = time.time() np.random.seed(random_seed) source_dir = os.path.join(FLAGS.data_root, dataset_articles) source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*')) total = len( source_files ) * 1000 if 'cnn' in dataset_articles or 'xsum' in dataset_articles else len( source_files) example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False) # batcher = Batcher(None, vocab, hps, single_pass=FLAGS.single_pass) model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, None, vocab) decoder.decode_iteratively(example_generator, total, names_to_types, ssi_list, hps)
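# The mains above all build their hyperparameter object the same way: filter the parsed flags down to a
# whitelist and freeze them into an immutable namedtuple, then derive a decode-time variant with
# max_dec_steps=1. A stripped-down sketch of that pattern with a plain dict standing in for FLAGS; the
# function and flag names here are illustrative, not taken from the original code.
from collections import namedtuple

def make_hps(flag_values, wanted_keys):
    """Keep only the whitelisted flags and freeze them into an immutable HParams tuple."""
    hps_dict = {k: v for k, v in flag_values.items() if k in wanted_keys}
    return namedtuple("HParams", sorted(hps_dict.keys()))(**hps_dict)

flag_values = {'mode': 'decode', 'batch_size': 16, 'beam_size': 4,
               'max_dec_steps': 100, 'unused_flag': 'ignored'}
hps = make_hps(flag_values, ['mode', 'batch_size', 'beam_size', 'max_dec_steps'])

# decode-time variant: run the decoder one step at a time for beam search,
# while the batcher keeps the full max_dec_steps for building batches
decode_hps = hps._replace(max_dec_steps=1)
print(hps.max_dec_steps, decode_hps.max_dec_steps)  # 100 1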
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) tf.logging.set_verbosity( tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if not os.path.exists(FLAGS.log_root): if FLAGS.mode == "train": os.makedirs(FLAGS.log_root) else: raise Exception( "Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = [ 'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'vocab_size' ] hps_dict = {} for key, val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness if hps.mode == 'train': print("creating model...") model = SummarizationModel(hps, vocab) setup_training(model, batcher) elif hps.mode == 'eval': model = SummarizationModel(hps, vocab) run_eval(model, batcher, vocab) elif hps.mode == 'decode': decode_model_hps = hps # This will be the hyperparameters for the decoder model decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
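# The decode branch above sets batch_size = beam_size because, during beam search, the decoder scores all
# beam hypotheses of a single example as one batch. One plausible way to picture that (not the original
# batcher code, just an illustration with made-up names) is to tile the one encoded example beam_size times:
import numpy as np

def beam_as_batch(encoder_input, beam_size):
    """Repeat a single encoded example so the decoder can score all beam hypotheses in one batch."""
    # encoder_input: shape (1, seq_len) -> (beam_size, seq_len)
    return np.repeat(encoder_input, beam_size, axis=0)

example = np.array([[7, 3, 9, 2]])
print(beam_as_batch(example, beam_size=4).shape)  # (4, 4)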
def main(args, local_rank): logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) vocabs = dict() vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS]) vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS]) if args.world_size == 1 or (dist.get_rank() == 0): logger.info(args) for name in vocabs: logger.info("vocab %s, size %d, coverage %.3f", name, vocabs[name].size, vocabs[name].coverage) set_seed(19940117) #device = torch.device('cpu') torch.cuda.set_device(local_rank) device = torch.device('cuda', local_rank) if args.resume_ckpt: model = MatchingModel.from_pretrained(vocabs, args.resume_ckpt) else: model = MatchingModel.from_params(vocabs, args.layers, args.embed_dim, args.ff_embed_dim, args.num_heads, args.dropout, args.output_dim, args.bow) if args.world_size > 1: set_seed(19940117 + dist.get_rank()) model = model.to(device) if args.resume_ckpt: dev_data = DataLoader(vocabs, args.dev_data, args.dev_batch_size, addition=args.additional_negs) acc = validate(model, dev_data, device) logger.info("initialize from %s, initial acc %.2f", args.resume_ckpt, acc) optimizer = Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9) lr_schedule = get_linear_schedule_with_warmup(optimizer, args.warmup_steps, args.total_train_steps) train_data = DataLoader(vocabs, args.train_data, args.per_gpu_train_batch_size, worddrop=args.worddrop, addition=args.additional_negs) global_step, step, epoch = 0, 0, 0 tr_stat = Statistics() logger.info("start training") model.train() while global_step <= args.total_train_steps: for batch in train_data: batch = move_to_device(batch, device) loss, acc, bsz = model(batch['src_tokens'], batch['tgt_tokens'], args.label_smoothing) tr_stat.update({ 'loss': loss.item() * bsz, 'nsamples': bsz, 'acc': acc * bsz }) tr_stat.step() loss.backward() step += 1 if not (step % args.gradient_accumulation_steps == -1 % args.gradient_accumulation_steps): continue if args.world_size > 1: average_gradients(model) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() lr_schedule.step() optimizer.zero_grad() global_step += 1 if args.world_size == 1 or (dist.get_rank() == 0): if global_step % args.print_every == -1 % args.print_every: logger.info("epoch %d, step %d, loss %.3f, acc %.3f", epoch, global_step, tr_stat['loss'] / tr_stat['nsamples'], tr_stat['acc'] / tr_stat['nsamples']) tr_stat = Statistics() if global_step > args.warmup_steps and global_step % args.eval_every == -1 % args.eval_every: dev_data = DataLoader(vocabs, args.dev_data, args.dev_batch_size, addition=args.additional_negs) acc = validate(model, dev_data, device) logger.info("epoch %d, step %d, dev, dev acc %.2f", epoch, global_step, acc) save_path = '%s/epoch%d_batch%d_acc%.2f' % ( args.ckpt, epoch, global_step, acc) model.save(args, save_path) model.train() if global_step > args.total_train_steps: break epoch += 1 logger.info('rank %d, finish training after %d steps', local_rank, global_step)
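# The matching-model loop above only steps the optimizer once every gradient_accumulation_steps micro-batches;
# the `step % n == -1 % n` test works because `-1 % n == n - 1` in Python, so the update fires on every n-th
# pass through the inner loop while gradients from the skipped passes keep accumulating. A compact sketch of
# that accumulation schedule written the more conventional way (model, loader and optimizer are stand-ins,
# and the toy loss is not the one used above):
import torch

def train_with_accumulation(model, loader, optimizer, accum_steps, max_norm=1.0):
    """Accumulate gradients over `accum_steps` micro-batches before each optimizer update."""
    model.train()
    for step, (x, y) in enumerate(loader, start=1):
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()                   # gradients add up across micro-batches
        if step % accum_steps != 0:       # update only on every accum_steps-th micro-batch
            continue
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()
        optimizer.zero_grad()

# toy usage: 6 micro-batches, one optimizer step every 3
model = torch.nn.Linear(4, 1)
data = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(6)]
train_with_accumulation(model, data, torch.optim.SGD(model.parameters(), lr=0.1), accum_steps=3)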