print 'Done.\n'

try:
    # This metadata is attached to all instances to allow cleanup to find
    # stale instances made by this utility
    instance_metadata = {
        'pulp_instance': 'True',
        'build_time': str(time.time()),
    }

    print 'Deploying instances...'
    os1.build_instances(config, instance_metadata)

    # Save the configuration for cleanup immediately since the configuration
    # can fail
    if args.deployed_config is None:
        args.deployed_config = args.config[0] + '.json'
    config_utils.save_config(config, args.deployed_config)

    print 'Applying role-specific configurations...'
    setup_utils.configure_instances(config)

    # Print out machine information and configuration
    print '\nThe following instances have been built:'
    for instance in config_utils.config_generator(config):
        print """
    Instance name: %(instance_name)s
    Role: %(role)s
    SSH: %(host_string)s
    """ % instance
    print 'The configuration file has been written to ' + args.deployed_config
except (Exception, KeyboardInterrupt), e:
    # Print exception message and quit
def main(_):
    log_dir = FLAGS.model_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    path_prefix = log_dir + "/"
    log_file_path = path_prefix + "log.txt"
    print('Log file path: {}'.format(log_file_path))
    log_file = open(log_file_path, 'wt')
    log_file.write("{}\n".format(FLAGS))
    log_file.flush()

    # save configuration
    config_utils.save_config(FLAGS, path_prefix + "config.json")

    print('Loading train set.')
    if FLAGS.data_split == 1:
        train_set, train_question_len = read_data_split_1(
            FLAGS.s1_train_path, isLower=FLAGS.isLower)
    else:
        train_set, train_question_len = read_data_split_2(
            FLAGS.s2_train_path, isLower=FLAGS.isLower)
    print('Number of training samples: {}'.format(len(train_set)))

    print('Loading test set.')
    if FLAGS.data_split == 1:
        dev_set, dev_question_len = read_data_split_1(
            FLAGS.s1_dev_path, isLower=FLAGS.isLower)
    else:
        dev_set, dev_question_len = read_data_split_2(
            FLAGS.s2_dev_path, isLower=FLAGS.isLower)
    print('Number of test samples: {}'.format(len(dev_set)))

    max_actual_len = max(train_question_len, dev_question_len)
    print('Max answer length: {}, truncated to {}'.format(
        max_actual_len, FLAGS.max_question_len))

    word_vocab = None
    POS_vocab = None
    NER_vocab = None
    has_pretrained_model = False
    best_path = path_prefix + "best.model"
    if os.path.exists(best_path + ".index"):
        has_pretrained_model = True
        print('There is an existing pretrained model. Loading vocabs:')
        if FLAGS.with_word:
            word_vocab = Vocab(embedding_path=FLAGS.word_vec_path)
            print('word_vocab: {}'.format(word_vocab.word_vecs.shape))
        if FLAGS.with_POS:
            POS_vocab = Vocab(
                embedding_path=os.path.join(path_prefix, "POS_vocab"))
            print('POS_vocab: {}'.format(POS_vocab.word_vecs.shape))
        if FLAGS.with_NER:
            NER_vocab = Vocab(
                embedding_path=os.path.join(path_prefix, "NER_vocab"))
            print('NER_vocab: {}'.format(NER_vocab.word_vecs.shape))
    else:
        print('Collecting vocabs.')
        (allWords, allPOSs, allNERs) = collect_vocabs(train_set)
        print('Number of words: {}'.format(len(allWords)))
        print('Number of allPOSs: {}'.format(len(allPOSs)))
        print('Number of allNERs: {}'.format(len(allNERs)))

        if FLAGS.with_word:
            word_vocab = Vocab(embedding_path=FLAGS.word_vec_path)
        if FLAGS.with_POS:
            POS_vocab = Vocab(vocab=allPOSs, dim=FLAGS.POS_dim)
            POS_vocab.dump_to_txt(os.path.join(path_prefix, "POS_vocab"))
        if FLAGS.with_NER:
            NER_vocab = Vocab(vocab=allNERs, dim=FLAGS.NER_dim)
            NER_vocab.dump_to_txt(os.path.join(path_prefix, "NER_vocab"))

    print('word vocab size {}'.format(word_vocab.vocab_size))
    sys.stdout.flush()

    print('Build data loaders ... ')
    train_data_loader = QGDataLoader(
        train_set, word_vocab, POS_vocab, NER_vocab, flags=FLAGS,
        isShuffle=True, isLoop=True, isSort=True)
    dev_data_loader = QGDataLoader(
        dev_set, word_vocab, POS_vocab, NER_vocab, flags=FLAGS,
        isShuffle=False, isLoop=False, isSort=True)
    print('Number of instances in train data loader: {}'.format(
        train_data_loader.get_num_instance()))
    print('Number of instances in dev data loader: {}'.format(
        dev_data_loader.get_num_instance()))
    sys.stdout.flush()

    # initialize the best bleu and accu scores for current training session
    best_accu = FLAGS.best_accu if 'best_accu' in FLAGS.__dict__ else 0.0
    best_bleu = FLAGS.best_bleu if 'best_bleu' in FLAGS.__dict__ else 0.0
    if best_accu > 0.0:
        print('With initial dev accuracy {}'.format(best_accu))
    if best_bleu > 0.0:
        print('With initial dev BLEU score {}'.format(best_bleu))

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-0.01, 0.01)
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                train_graph = ModelGraph(
                    word_vocab=word_vocab, POS_vocab=POS_vocab,
                    NER_vocab=NER_vocab, flags=FLAGS, mode=FLAGS.mode)

        assert FLAGS.mode in ('ce_train', 'rl_train', 'rl_ce_train')
        valid_mode = 'evaluate' if FLAGS.mode == 'ce_train' else 'evaluate_bleu'

        with tf.name_scope("Valid"):
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                valid_graph = ModelGraph(
                    word_vocab=word_vocab, POS_vocab=POS_vocab,
                    NER_vocab=NER_vocab, flags=FLAGS, mode=valid_mode)

        initializer = tf.global_variables_initializer()

        _vars = {}
        for var in tf.all_variables():
            if "word_embedding" in var.name:
                continue
            if not var.name.startswith("Model"):
                continue
            _vars[var.name.split(":")[0]] = var
        saver = tf.train.Saver(_vars)

        config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                intra_op_parallelism_threads=1)
        sess = tf.Session(config=config)
        sess.run(initializer)

        if has_pretrained_model:
            print("Restoring model from " + best_path)
            saver.restore(sess, best_path)
            print("DONE!")

            if FLAGS.mode in ('rl_train', 'rl_ce_train') \
                    and abs(best_bleu) < 0.00001:
                print("Getting BLEU score for the model")
                best_bleu = evaluate(sess, valid_graph, dev_data_loader,
                                     flags=FLAGS)['dev_bleu']
                FLAGS.best_bleu = best_bleu
                config_utils.save_config(FLAGS, path_prefix + "config.json")
                print('BLEU = %.4f' % best_bleu)
                log_file.write('BLEU = %.4f\n' % best_bleu)

            if FLAGS.mode == 'ce_train' and abs(best_accu) < 0.00001:
                print("Getting ACCU score for the model")
                best_accu = evaluate(sess, valid_graph, dev_data_loader,
                                     flags=FLAGS)['dev_accu']
                FLAGS.best_accu = best_accu
                config_utils.save_config(FLAGS, path_prefix + "config.json")
                print('ACCU = %.4f' % best_accu)
                log_file.write('ACCU = %.4f\n' % best_accu)

        print('Start the training loop.')
        train_size = train_data_loader.get_num_batch()
        max_steps = train_size * FLAGS.n_epochs
        total_loss = 0.0
        start_time = time.time()
        for step in xrange(max_steps):
            cur_batch = train_data_loader.nextBatch()
            if FLAGS.mode == 'rl_train':
                loss_value = train_graph.rl_train(sess, cur_batch,
                                                  with_ce=False)
            elif FLAGS.mode == 'rl_ce_train':
                loss_value = train_graph.rl_train(sess, cur_batch,
                                                  with_ce=True)
            elif FLAGS.mode == 'ce_train':
                loss_value = train_graph.ce_train(sess, cur_batch)
            total_loss += loss_value

            if step % 100 == 0:
                print('{} '.format(step), end="")
                sys.stdout.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % train_data_loader.get_num_batch() == 0 \
                    or (step + 1) == max_steps:
                duration = time.time() - start_time
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, total_loss, duration))
                log_file.write('Step %d: loss = %.2f (%.3f sec)\n' %
                               (step, total_loss, duration))
                log_file.flush()
                sys.stdout.flush()
                total_loss = 0.0

                # Evaluate against the validation set.
                start_time = time.time()
                sess.run(train_graph.ema_to_vars_op)
                res_dict = evaluate(sess, valid_graph, dev_data_loader,
                                    flags=FLAGS, suffix=str(step))
                if valid_graph.mode == 'evaluate':
                    dev_loss = res_dict['dev_loss']
                    dev_accu = res_dict['dev_accu']
                    dev_right = int(res_dict['dev_right'])
                    dev_total = int(res_dict['dev_total'])
                    print('Dev loss = %.4f' % dev_loss)
                    log_file.write('Dev loss = %.4f\n' % dev_loss)
                    print('Dev accu = %.4f %d/%d' %
                          (dev_accu, dev_right, dev_total))
                    log_file.write('Dev accu = %.4f %d/%d\n' %
                                   (dev_accu, dev_right, dev_total))
                    log_file.flush()
                    if best_accu < dev_accu:
                        print('Saving weights, ACCU {} (prev_best) < {} (cur)'
                              .format(best_accu, dev_accu))
                        saver.save(sess, best_path)
                        best_accu = dev_accu
                        FLAGS.best_accu = dev_accu
                        config_utils.save_config(
                            FLAGS, path_prefix + "config.json")
                else:
                    dev_bleu = res_dict['dev_bleu']
                    print('Dev bleu = %.4f' % dev_bleu)
                    log_file.write('Dev bleu = %.4f\n' % dev_bleu)
                    log_file.flush()
                    if best_bleu < dev_bleu:
                        print('Saving weights, BLEU {} (prev_best) < {} (cur)'
                              .format(best_bleu, dev_bleu))
                        saver.save(sess, best_path)
                        best_bleu = dev_bleu
                        FLAGS.best_bleu = dev_bleu
                        config_utils.save_config(
                            FLAGS, path_prefix + "config.json")

                sess.run(train_graph.restore_backup_vars_op)
                duration = time.time() - start_time
                print('Duration %.3f sec' % (duration))
                sys.stdout.flush()
                log_file.write('Duration %.3f sec\n' % (duration))
                log_file.flush()

    log_file.close()
def main(argv):
    del argv  # Unused.

    params = factory.config_generator(FLAGS.model)
    if FLAGS.config_file:
        params = params_dict.override_params_dict(
            params, FLAGS.config_file, is_strict=True)
    params = params_dict.override_params_dict(
        params, FLAGS.params_override, is_strict=True)

    if not FLAGS.use_tpu:
        params.override({
            'architecture': {
                'use_bfloat16': False,
            },
            'batch_norm_activation': {
                'use_sync_bn': False,
            },
        }, is_strict=True)

    params.override({
        'platform': {
            'eval_master': FLAGS.eval_master,
            'tpu': FLAGS.tpu,
            'tpu_zone': FLAGS.tpu_zone,
            'gcp_project': FLAGS.gcp_project,
        },
        'tpu_job_name': FLAGS.tpu_job_name,
        'use_tpu': FLAGS.use_tpu,
        'model_dir': FLAGS.model_dir,
        'train': {
            'num_shards': FLAGS.num_cores,
        },
    }, is_strict=False)

    # Only run spatial partitioning in training mode.
    if FLAGS.mode != 'train':
        params.train.input_partition_dims = None
        params.train.num_cores_per_replica = None

    params.validate()
    params.lock()
    pp = pprint.PrettyPrinter()
    params_str = pp.pformat(params.as_dict())
    logging.info('Model Parameters: %s', params_str)

    # Builds detection model on TPUs.
    model_fn = model_builder.ModelFn(params)
    executor = tpu_executor.TpuExecutor(model_fn, params)

    # Prepares input functions for train and eval.
    train_input_fn = input_reader.InputFn(
        params.train.train_file_pattern, params, mode=ModeKeys.TRAIN,
        dataset_type=params.train.train_dataset_type)
    if params.eval.type == 'customized':
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern, params, mode=ModeKeys.EVAL,
            dataset_type=params.eval.eval_dataset_type)
    else:
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern, params,
            mode=ModeKeys.PREDICT_WITH_GT,
            dataset_type=params.eval.eval_dataset_type)

    # Runs the model.
    if FLAGS.mode == 'train':
        config_utils.save_config(params, params.model_dir)
        executor.train(train_input_fn, params.train.total_steps)
        if FLAGS.eval_after_training:
            executor.evaluate(
                eval_input_fn,
                params.eval.eval_samples // params.eval.eval_batch_size)

    elif FLAGS.mode == 'eval':

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         params.eval.eval_timeout)
            return True

        # Runs evaluation when there's a new checkpoint.
        for ckpt in tf.train.checkpoints_iterator(
                params.model_dir,
                min_interval_secs=params.eval.min_eval_interval,
                timeout=params.eval.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminates eval job when final checkpoint is reached.
            current_step = int(os.path.basename(ckpt).split('-')[1])

            logging.info('Starting to evaluate.')
            try:
                executor.evaluate(
                    eval_input_fn,
                    params.eval.eval_samples // params.eval.eval_batch_size,
                    ckpt)
                if current_step >= params.train.total_steps:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU
                # worker, sometimes the TPU worker does not finish
                # initializing until long after the CPU job tells it to start
                # evaluating. In this case, the checkpoint file could have
                # been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        config_utils.save_config(params, params.model_dir)
        num_cycles = int(params.train.total_steps /
                         params.eval.num_steps_per_eval)
        for cycle in range(num_cycles):
            logging.info('Start training cycle %d.', cycle)
            current_cycle_last_train_step = ((cycle + 1) *
                                             params.eval.num_steps_per_eval)
            executor.train(train_input_fn, current_cycle_last_train_step)
            executor.evaluate(
                eval_input_fn,
                params.eval.eval_samples // params.eval.eval_batch_size)

    elif FLAGS.mode == 'predict':
        file_pattern = FLAGS.predict_file_pattern
        if not file_pattern:
            raise ValueError('"predict_file_pattern" parameter is required.')

        output_dir = FLAGS.predict_output_dir
        if not output_dir:
            raise ValueError('"predict_output_dir" parameter is required.')

        test_input_fn = input_reader.InputFn(
            file_pattern, params, mode=ModeKeys.PREDICT_WITH_GT,
            dataset_type=params.eval.eval_dataset_type)

        checkpoint_prefix = 'model.ckpt-' + FLAGS.predict_checkpoint_step
        checkpoint_path = os.path.join(FLAGS.model_dir, checkpoint_prefix)
        if not tf.train.checkpoint_exists(checkpoint_path):
            checkpoint_path = os.path.join(FLAGS.model_dir, 'best_checkpoints',
                                           checkpoint_prefix)
            if not tf.train.checkpoint_exists(checkpoint_path):
                raise ValueError('Checkpoint not found: %s/%s' %
                                 (FLAGS.model_dir, checkpoint_prefix))
        executor.predict(test_input_fn, checkpoint_path, output_dir=output_dir)

    else:
        logging.info('Mode not found.')
def main(argv):
    del argv  # Unused.

    params = factory.config_generator(FLAGS.model)
    if FLAGS.config_file:
        params = params_dict.override_params_dict(
            params, FLAGS.config_file, is_strict=True)
    params = params_dict.override_params_dict(
        params, FLAGS.params_override, is_strict=True)

    params.override({
        'use_tpu': FLAGS.use_tpu,
        'model_dir': FLAGS.model_dir,
    }, is_strict=True)

    if not FLAGS.use_tpu:
        params.override({
            'architecture': {
                'use_bfloat16': False,
            },
            'batch_norm_activation': {
                'use_sync_bn': False,
            },
        }, is_strict=True)

    # Only run spatial partitioning in training mode.
    if FLAGS.mode != 'train':
        params.train.input_partition_dims = None
        params.train.num_cores_per_replica = None

    params_to_save = params_dict.ParamsDict(params)

    params.override({
        'platform': {
            'eval_master': FLAGS.eval_master,
            'tpu': FLAGS.tpu,
            'tpu_zone': FLAGS.tpu_zone,
            'gcp_project': FLAGS.gcp_project,
        },
        'tpu_job_name': FLAGS.tpu_job_name,
        'train': {
            'num_shards': FLAGS.num_cores,
        },
    }, is_strict=False)

    params.validate()
    params.lock()
    pp = pprint.PrettyPrinter()
    params_str = pp.pformat(params.as_dict())
    logging.info('Model Parameters: %s', params_str)

    # Builds detection model on TPUs.
    model_fn = model_builder.ModelFn(params)
    executor = tpu_executor.TpuExecutor(model_fn, params)

    # Prepares input functions for train and eval.
    train_input_fn = input_reader.InputFn(
        params.train.train_file_pattern, params, mode=ModeKeys.TRAIN,
        dataset_type=params.train.train_dataset_type)
    if params.eval.type == 'customized':
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern, params, mode=ModeKeys.EVAL,
            dataset_type=params.eval.eval_dataset_type)
    else:
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern, params,
            mode=ModeKeys.PREDICT_WITH_GT,
            dataset_type=params.eval.eval_dataset_type)

    if params.eval.eval_samples:
        eval_times = params.eval.eval_samples // params.eval.eval_batch_size
    else:
        eval_times = None

    # Runs the model.
    if FLAGS.mode == 'train':
        config_utils.save_config(params_to_save, params.model_dir)
        executor.train(train_input_fn, params.train.total_steps)
        if FLAGS.eval_after_training:
            executor.evaluate(eval_input_fn, eval_times)

    elif FLAGS.mode == 'eval':

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         params.eval.eval_timeout)
            return True

        # Runs evaluation when there's a new checkpoint.
        for ckpt in tf.train.checkpoints_iterator(
                params.model_dir,
                min_interval_secs=params.eval.min_eval_interval,
                timeout=params.eval.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminates eval job when final checkpoint is reached.
            current_step = int(
                six.ensure_str(os.path.basename(ckpt)).split('-')[1])

            logging.info('Starting to evaluate.')
            try:
                executor.evaluate(eval_input_fn, eval_times, ckpt)
                if current_step >= params.train.total_steps:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break
            except tf.errors.NotFoundError as e:
                logging.info(
                    'Error occurred during evaluation: NotFoundError: %s', e)

    elif FLAGS.mode == 'train_and_eval':
        config_utils.save_config(params_to_save, params.model_dir)
        num_cycles = int(params.train.total_steps /
                         params.eval.num_steps_per_eval)
        for cycle in range(num_cycles):
            logging.info('Start training cycle %d.', cycle)
            current_cycle_last_train_step = ((cycle + 1) *
                                             params.eval.num_steps_per_eval)
            executor.train(train_input_fn, current_cycle_last_train_step)
            executor.evaluate(eval_input_fn, eval_times)

    else:
        logging.info('Mode not found.')
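# -----------------------------------------------------------------------------
# The snippets above come from different projects, but each one funnels its
# resolved configuration through a `config_utils.save_config(config, path)`
# call before or during training. The sketch below is NOT the actual
# `config_utils` module used by any of these projects; it is a minimal,
# hypothetical illustration of what such a helper could look like, assuming a
# JSON output format and a dict- or namespace-like config object.
import json
import os


def save_config(config, path):
    """Serialize a configuration object to JSON at `path` (hypothetical sketch).

    Accepts a plain dict or an object whose attributes hold the settings
    (e.g. a FLAGS-like namespace). Values that JSON cannot encode are written
    via `str()` so the dump does not fail on exotic types.
    """
    if not isinstance(config, dict):
        # Fall back to the object's attribute dict for namespace-like configs.
        config = {k: v for k, v in vars(config).items()
                  if not k.startswith('_')}
    out_dir = os.path.dirname(path)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(path, 'w') as f:
        json.dump(config, f, indent=2, sort_keys=True, default=str)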