def main(unused_argv): if len( unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists( os.path.join(FLAGS.data_root, FLAGS.dataset_name)) or len( os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: print( 'No TF example data found at %s so creating it from raw data.' % os.path.join( FLAGS.data_root, FLAGS.dataset_name)) convert_data.process_dataset(FLAGS.dataset_name) logging.set_verbosity(logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode != 'decode': raise Exception( "The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key, val in FLAGS.__flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) if FLAGS.pg_mmr or FLAGS.pg_mmr_sim or FLAGS.pg_mmr_diff: # Fit the TFIDF vectorizer if not already fitted if FLAGS.importance_fn == 'tfidf': tfidf_model_path = os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer', FLAGS.dataset_name + '.dill') if not os.path.exists(tfidf_model_path): print( 'No TFIDF vectorizer model file found at %s, so fitting the model now.' % tfidf_model_path) tfidf_vectorizer = fit_tfidf_vectorizer(hps, vocab) with open(tfidf_model_path, 'wb') as f: dill.dump(tfidf_vectorizer, f) # Train the SVR model on the CNN validation set if not already trained if FLAGS.importance_fn == 'svr': save_path = os.path.join(FLAGS.data_root, 'svr_training_data') importance_model_path = os.path.join(FLAGS.actual_log_root, 'svr.pickle') dataset_split = 'val' if not os.path.exists(importance_model_path): if not os.path.exists(save_path) or len( os.listdir(save_path)) == 0: print( 'No importance_feature instances found at %s so creating it from raw data.' % save_path) decode_model_hps = hps._replace( max_dec_steps=1, batch_size=100, mode='calc_features') # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries cnn_dm_train_data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, dataset_split + '*') batcher = Batcher(cnn_dm_train_data_path, vocab, decode_model_hps, single_pass=FLAGS.single_pass, cnn_500_dm_500=False) calc_features(cnn_dm_train_data_path, decode_model_hps, vocab, batcher, save_path) print( 'No importance_feature SVR model found at %s so training it now.' % importance_model_path) features_list = importance_features.get_features_list(True) sent_reps = importance_features.load_data( os.path.join(save_path, dataset_split + '*'), -1) print 'Loaded %d sentences representations' % len(sent_reps) x_y = importance_features.features_to_array(sent_reps, features_list) train_x, train_y = x_y[:, :-1], x_y[:, -1] svr_model = importance_features.run_training(train_x, train_y) with open(importance_model_path, 'wb') as f: cPickle.dump(svr_model, f) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(111) # a seed value for randomness # Start decoding on multi-document inputs if hps.mode == 'decode': decode_model_hps = hps._replace( max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode() # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
def main(unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) # if '_sent' in FLAGS.dataset_name: # FLAGS.data_root = os.path.expanduser('~') + '/data/tf_data/with_coref_and_tag_tokens' if FLAGS.pg_mmr: FLAGS.data_root = os.path.expanduser('~') + "/data/tf_data/with_coref_and_ssi" if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if FLAGS.dataset_name in kaiqiang_dataset_names: FLAGS.skip_with_less_than_3 = False if not os.path.exists(os.path.join(FLAGS.data_root, FLAGS.dataset_name)) or len(os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: print(('No TF example data found at %s so creating it from raw data.' % os.path.join(FLAGS.data_root, FLAGS.dataset_name))) convert_data.process_dataset(FLAGS.dataset_name) if FLAGS.mode == 'decode': extractor = '_bert' if FLAGS.use_bert else '_lambdamart' FLAGS.use_pretrained = True FLAGS.single_pass = True else: extractor = '' pretrained_dataset = FLAGS.dataset_name if FLAGS.dataset_name == 'duc_2004': pretrained_dataset = 'cnn_dm' if FLAGS.pg_mmr: FLAGS.exp_name += '_pgmmr' if FLAGS.singles_and_pairs == 'both': FLAGS.exp_name = FLAGS.exp_name + extractor + '_both' if FLAGS.mode == 'decode': FLAGS.pretrained_path = os.path.join(FLAGS.log_root, pretrained_dataset + '_pgmmr_both') dataset_articles = FLAGS.dataset_name elif FLAGS.singles_and_pairs == 'singles': FLAGS.exp_name = FLAGS.exp_name + extractor + '_singles' if FLAGS.mode == 'decode': FLAGS.pretrained_path = os.path.join(FLAGS.log_root, pretrained_dataset + '_pgmmr_singles') dataset_articles = FLAGS.dataset_name + '_singles' if FLAGS.notrain: FLAGS.exp_name += '_notrain' FLAGS.pretrained_path = original_pretrained_path[FLAGS.dataset_name] if FLAGS.finetune: FLAGS.exp_name += '_finetune' if FLAGS.mode == 'decode': FLAGS.pretrained_path += '_finetune' if FLAGS.sep: FLAGS.exp_name += '_sep' if FLAGS.tag_tokens: FLAGS.exp_name += '_tag' extractor = 'bert' if FLAGS.use_bert else 'lambdamart' bert_suffix = '' # if FLAGS.use_bert: # if FLAGS.sentemb: # bert_suffix += '_sentemb' # if FLAGS.artemb: # bert_suffix += '_artemb' # if FLAGS.plushidden: # bert_suffix += '_plushidden' # if FLAGS.mode == 'decode': # if FLAGS.sentemb: # FLAGS.exp_name += '_sentemb' # if FLAGS.artemb: # FLAGS.exp_name += '_artemb' # if FLAGS.plushidden: # FLAGS.exp_name += '_plushidden' if FLAGS.upper_bound: FLAGS.exp_name = FLAGS.exp_name + '_upperbound' ssi_list = None # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth) else: if FLAGS.mode == 'decode': my_log_dir = os.path.join(log_dir, '%s_%s_%s%s' % (FLAGS.dataset_name, extractor, FLAGS.singles_and_pairs, bert_suffix)) FLAGS.ssi_data_path = my_log_dir logging.set_verbosity(logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) if FLAGS.convert_to_importance_model: convert_to_importance_model() # FLAGS.convert_to_coverage_model = True if FLAGS.word_imp_reg: if FLAGS.coverage: raise Exception('Importance loss does not work at the same time with coverage loss yet. Need to modify the total_loss in model.py.') FLAGS.log_root += '_imp' + str(FLAGS.imp_loss_wt) if FLAGS.imp_loss_oneminus: FLAGS.log_root += '_oneminus' print(util.bcolors.OKGREEN + "Experiment path: " + FLAGS.log_root + util.bcolors.ENDC) if FLAGS.dataset_name == 'duc_2004': vocab = Vocab(FLAGS.vocab_path + '_' + 'cnn_dm', FLAGS.vocab_size, add_sep=FLAGS.sep) # create a vocabulary else: vocab_datasets = [os.path.basename(file_path).split('vocab_')[1] for file_path in glob.glob(FLAGS.vocab_path + '_*')] original_dataset_name = [file_name for file_name in vocab_datasets if file_name in FLAGS.dataset_name] if len(original_dataset_name) > 1: raise Exception('Too many choices for vocab file') if len(original_dataset_name) < 1: raise Exception('No vocab file for dataset created. Run make_vocab.py --dataset_name=<my original dataset name>') original_dataset_name = original_dataset_name[0] FLAGS.original_dataset_name = original_dataset_name vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name, FLAGS.vocab_size, add_sep=FLAGS.sep) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs # hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', # 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', # 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'lambdamart_input', 'pg_mmr', 'singles_and_pairs', 'skip_with_less_than_3', 'ssi_data_path', # 'dataset_name', 'word_imp_reg', 'imp_loss_wt', 'tag_tokens'] hparam_list = [item for item in list(FLAGS.flag_values_dict().keys()) if item != '?'] hps_dict = {} for key,val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", list(hps_dict.keys()))(**hps_dict) if FLAGS.pg_mmr: # Fit the TFIDF vectorizer if not already fitted if FLAGS.importance_fn == 'tfidf': tfidf_model_path = os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer', FLAGS.original_dataset_name + '.dill') if not os.path.exists(tfidf_model_path): print(('No TFIDF vectorizer model file found at %s, so fitting the model now.' % tfidf_model_path)) tfidf_vectorizer = fit_tfidf_vectorizer(hps, vocab) with open(tfidf_model_path, 'wb') as f: dill.dump(tfidf_vectorizer, f) # Train the SVR model on the CNN validation set if not already trained if FLAGS.importance_fn == 'svr': save_path = os.path.join(FLAGS.data_root, 'svr_training_data') importance_model_path = os.path.join(FLAGS.actual_log_root, 'svr.pickle') dataset_split = 'val' if not os.path.exists(importance_model_path): if not os.path.exists(save_path) or len(os.listdir(save_path)) == 0: print(('No importance_feature instances found at %s so creating it from raw data.' % save_path)) decode_model_hps = hps._replace( max_dec_steps=1, batch_size=100, mode='calc_features') # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries cnn_dm_train_data_path = os.path.join(FLAGS.data_root, 'cnn_500_dm_500', dataset_split + '*') batcher = Batcher(cnn_dm_train_data_path, vocab, decode_model_hps, single_pass=FLAGS.single_pass, cnn_500_dm_500=True) calc_features(cnn_dm_train_data_path, decode_model_hps, vocab, batcher, save_path) print(('No importance_feature SVR model found at %s so training it now.' % importance_model_path)) features_list = importance_features.get_features_list(True) sent_reps = importance_features.load_data(os.path.join(save_path, dataset_split + '*'), -1) print('Loaded %d sentences representations' % len(sent_reps)) x_y = importance_features.features_to_array(sent_reps, features_list) train_x, train_y = x_y[:,:-1], x_y[:,-1] svr_model = importance_features.run_training(train_x, train_y) with open(importance_model_path, 'wb') as f: pickle.dump(svr_model, f) # Create a batcher object that will create minibatches of data batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass) tf.set_random_seed(113) # a seed value for randomness # Start decoding on multi-document inputs if hps.mode == 'train': print("creating model...") model = SummarizationModel(hps, vocab) setup_training(model, batcher) elif hps.mode == 'eval': model = SummarizationModel(hps, vocab) run_eval(model, batcher, vocab) elif hps.mode == 'decode': decode_model_hps = hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode() # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) # while True: # a=0 else: raise ValueError("The 'mode' flag must be one of train/eval/decode")