def _init_models(self, args):
    if args.multi_env is not None:
        assert len(args.multi_demos) == len(args.multi_episodes)

    args.model = args.model or ImitationLearning.default_model_name(args)
    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    self.il_learn_forward = ImitationLearning(args)
    self.il_learn_backward = ImitationLearning(args)

    # Define logger and Tensorboard writer
    self.header = ([
        "update", "frames", "FPS", "duration", "entropy", "policy_loss",
        "train_accuracy"
    ] + ["validation_accuracy"])
    if args.multi_env is None:
        self.header.extend(["validation_return", "validation_success_rate"])
    else:
        self.header.extend(
            ["validation_return_{}".format(env) for env in args.multi_env])
        self.header.extend([
            "validation_success_rate_{}".format(env)
            for env in args.multi_env
        ])
    writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(utils.get_log_dir(args.model))

    # Define csv writer
    self.csv_writer = None
    self.csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    first_created = not os.path.exists(self.csv_path)
    # We don't buffer data going into the csv log, because we assume
    # that one update takes much longer than one write to the log.
    self.csv_writer = csv.writer(open(self.csv_path, 'a', 1))
    if first_created:
        self.csv_writer.writerow(self.header)

    # Get the status path
    self.status_path = os.path.join(utils.get_log_dir(args.model),
                                    'status.json')

    # Log command, availability of CUDA, and model
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(self.il_learn_forward.acmodel)
def main(argv=None):
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string(
        'training_data',
        '../data/tfrecords/jet_training_8101_pT-ALL_eta-ALL_Pythia.tfrecords',
        'the training data set')
    tf.app.flags.DEFINE_string(
        'validation_data',
        '../data/tfrecords/jet_validation_2701_pT-ALL_eta-ALL_Pythia.tfrecords',
        'the validation data set')
    tf.app.flags.DEFINE_integer('batch_size', 500, 'batch size')
    tf.app.flags.DEFINE_integer('num_epochs', 30, 'the number of epochs')

    log_dir = get_log_dir(dname='test', creation=True)

    train(tfrecords_path=FLAGS.training_data,
          tfevents_dir=log_dir.tfevents.training.path,
          ckpt_dir=log_dir.ckpt.path,
          benchmark_path=log_dir.path,
          batch_size=FLAGS.batch_size,
          num_epochs=FLAGS.num_epochs)

    evaluate(training_data=FLAGS.training_data,
             validation_data=FLAGS.validation_data,
             log_dir=log_dir)

    draw_all_qg_histograms(qg_histogram_dir=log_dir.qg_histogram)
def __init__(self, name = "messages", show_date = True, separator = " ", extension = "txt"): """ Initializes a new logger object. The defaults are for status and error message logging, but the if a new name is used, there will be a new directory in the log directory that will store the data made by the logger with that name Arguments: name: The logger's name. All logs will go into a directory with the logger's name show_date: Whether or not a date should be printed with every log message separator: The delimiter for all strings passed to the logger in a log command extension: The file-type for the log messages. Defaults to '.txt'. """ self.name = name self.directory = utils.get_log_dir() self.show_date = show_date self.separator = separator self.extension = extension #check for log directory, create if necessary if not self.directory: new_dir = str.replace(utils.get_resource_files_prefix(), "resources", "log") os.mkdir(new_dir) #check for log/name directory, create if doesn't exist try: os.makedirs("%s%s/" %(self.directory, self.name)) except OSError: if not os.path.isdir("%s%s/" %(self.directory, self.name)): raise #check for name directory in log, create if necessary self.log("logger created")
def build_codebook_from_name(experiment_name,
                             experiment_group='',
                             return_dataset=False,
                             return_decoder=False):
    import os
    import configparser
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')
    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    import utils as u
    import tensorflow as tf

    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    cfg_file_path = u.get_train_config_exp_file_path(log_dir, experiment_name)
    dataset_path = u.get_dataset_path(workspace_path)

    if os.path.exists(cfg_file_path):
        args = configparser.ConfigParser()
        args.read(cfg_file_path)
    else:
        print('ERROR: Config File not found:', cfg_file_path)
        exit(-1)

    with tf.variable_scope(experiment_name):
        dataset = build_dataset(dataset_path, args)
        x = tf.placeholder(tf.float32, [None] + list(dataset.shape))
        encoder = build_encoder(x, args)
        codebook = build_codebook(encoder, dataset, args)
        if return_decoder:
            reconst_target = tf.placeholder(tf.float32,
                                            [None] + list(dataset.shape))
            decoder = build_decoder(reconst_target, encoder, args)

    if return_dataset:
        if return_decoder:
            return codebook, dataset, decoder
        else:
            return codebook, dataset
    else:
        return codebook
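# A minimal usage sketch (the experiment names are placeholders; the session
# handling needed to actually query the codebook is outside this snippet):
#
#   codebook, dataset, decoder = build_codebook_from_name(
#       'my_autoencoder', experiment_group='exp_group',
#       return_dataset=True, return_decoder=True)
#   codebook_only = build_codebook_from_name('my_autoencoder',
#                                            experiment_group='exp_group')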
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, required=True,
                        help='the log directory path')
    args = parser.parse_args()

    log_dir = get_log_dir(path=args.log_dir, creation=False)
    path_and_step = get_saved_model_paths(log_dir.saved_models.path)
    for i, (saved_model_path, step) in enumerate(path_and_step):
        print("\n\n\n[{i}/{total}]: {path}".format(
            i=i, total=len(path_and_step), path=saved_model_path))
        evaluate(saved_model_path, step, log_dir)
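# get_saved_model_paths is assumed here to return (path, step) pairs, e.g.
#
#   [('.../saved_models/model-1000', 1000),
#    ('.../saved_models/model-2000', 2000)]
#
# so the loop above evaluates every saved checkpoint in turn.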
def __init__(self, adj_mx, **kwargs):
    self._kwargs = kwargs
    self._data_kwargs = kwargs.get('data')
    self._model_kwargs = kwargs.get('model')
    self._train_kwargs = kwargs.get('train')
    self.dataset_name = self._data_kwargs['dataset_dir'].split('/')[-1]
    self.adj_mx = adj_mx
    self.model_params = dict()
    self.model_params['seq_len'] = 30
    self.K = [1, 5, 10, 20, 50, 100]

    model_name = 'net_act_orig'  # self._kwargs['model_name']
    self.log_file_name = utils.get_log_dir(log_dir=self._kwargs['log_dir'],
                                           model_name=model_name,
                                           dataset_name=self.dataset_name)
    if not os.path.exists(self._kwargs['save_dir']):
        os.makedirs(self._kwargs['save_dir'])
    if not os.path.exists(
            os.path.join(self._kwargs['save_dir'], self.dataset_name)):
        os.makedirs(os.path.join(self._kwargs['save_dir'],
                                 self.dataset_name))
    if not os.path.exists(
            os.path.join(self._kwargs['save_dir'], self.dataset_name,
                         self._kwargs['model_name'])):
        os.makedirs(os.path.join(self._kwargs['save_dir'], self.dataset_name,
                                 self._kwargs['model_name']))

    log_level = self._kwargs.get('log_level', 'INFO')
    self._logger = utils.get_logger(self.log_file_name, name=__name__,
                                    level=log_level)
    self._writer = tf.summary.FileWriter(self.log_file_name)
    self._logger.info(json.dumps(kwargs, indent=2))
    self._saved_file_name = 'best_model.ckpt'

    user_id, reverse_user_id, item_id, reverse_item_id = \
        utils.load_ids(self._data_kwargs['dataset_dir'],
                       self._data_kwargs['ids_file_name'])
    print(len(user_id), len(reverse_user_id), len(item_id),
          len(reverse_item_id))
    self.n_users = len(user_id)
    self.n_context = self._model_kwargs['context_size']
    data_examples, self.user_history, num_bins = utils.load_dataset_timestamp(
        self._data_kwargs['dataset_dir'], self._data_kwargs['dataset_name'],
        self.n_users, self.n_context, self.model_params['seq_len'])
    self.num_bins = num_bins

    self.model_params['batch_size'] = self._data_kwargs['batch_size']
    self.model_params['user_size'] = self.n_users
    self.model_params['item_size'] = len(item_id)
    self.model_params['state_size'] = self._model_kwargs['state_size']
    self.model_params['emb_size'] = self._model_kwargs['emb_size']
    self.model_params['lr'] = self._train_kwargs['base_lr']
    self.model_params['n_bins'] = self.num_bins
    self.model_params['context_size'] = self.n_context
    self.model_params['start_lr'] = len(
        data_examples) // self._data_kwargs['batch_size']
    self.model_params['min_lr'] = self._train_kwargs['min_learning_rate']
    self.model_params['use_attn'] = self._model_kwargs['use_attn']
    self.model_params['normalize'] = self._model_kwargs['normalize']
    self.model_params['max_diff'] = self._model_kwargs['max_diff']
    if self._model_kwargs['n_samples'] == -1:
        self.model_params['n_samples'] = len(item_id)
    else:
        self.model_params['n_samples'] = self._model_kwargs['n_samples']
    self.model_params['comb'] = self._model_kwargs['comb']

    self.data_iterator = utils.Loader(data_examples,
                                      options=self.model_params)
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    pretrain_glm.tokenizer = tokenizer
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    train_block_dataloader, valid_block_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        if mpu.get_model_parallel_rank() == 0:
            train_dataset, valid_dataset = train_valid_datasets_provider(
                args, tokenizer)
            train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
                train_dataset, valid_dataset, args)
            if args.no_validation:
                valid_dataloader = None
            train_iters = torch.cuda.LongTensor([len(train_dataloader)])
        else:
            train_iters = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(train_iters,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        if mpu.get_model_parallel_rank() != 0:
            args.train_iters_per_epoch = train_iters[0].item()
            args.train_iters = args.epochs * args.train_iters_per_epoch
            train_dataloader = FakeDataloader(args.train_iters_per_epoch)
            if args.no_validation:
                valid_dataloader = None
            else:
                valid_dataloader = FakeDataloader(None)
        if args.block_lm_ratio > 0.0:
            if mpu.get_model_parallel_rank() == 0:
                train_block_dataset, valid_block_dataset = train_valid_datasets_provider(
                    args, tokenizer, pattern_text=True)
                train_block_dataloader = make_data_loader(
                    train_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    args.train_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
                valid_block_dataloader = make_data_loader(
                    valid_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    (args.train_iters // args.eval_interval + 1) *
                    args.eval_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
            else:
                train_block_dataloader = FakeDataloader(args.train_iters)
                valid_block_dataloader = FakeDataloader(None)
            train_block_dataloader, valid_block_dataloader = iter(
                train_block_dataloader), iter(valid_block_dataloader)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0 and not args.no_validation:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert:
        task_tokens = None
        if args.continuous_prompt and args.prompt_init:
            if mpu.get_model_parallel_rank() == 0:
                dataset = train_dataloader.dataset
                processor, pvp = dataset.processor, dataset.pvp
                task_tokens = []
                for label in processor.get_labels():
                    verbalizer = pvp.verbalize(label)[0]
                    verbalizer_ids = tokenizer.EncodeAsIds(
                        verbalizer).tokenization
                    task_tokens += verbalizer_ids
                print_rank_0("Task tokens: " +
                             tokenizer.DecodeIds(task_tokens))
                num_task_tokens = len(task_tokens)
            else:
                num_task_tokens, task_tokens = 0, []
            num_task_tokens = torch.cuda.LongTensor([num_task_tokens])
            torch.distributed.broadcast(num_task_tokens,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            num_task_tokens = num_task_tokens.item()
            if num_task_tokens > 0:
                if mpu.get_model_parallel_rank() == 0:
                    task_tokens = torch.cuda.LongTensor(task_tokens)
                else:
                    task_tokens = torch.empty(
                        num_task_tokens,
                        device=torch.cuda.current_device(),
                        dtype=torch.long)
                torch.distributed.broadcast(
                    task_tokens,
                    mpu.get_model_parallel_src_rank(),
                    group=mpu.get_model_parallel_group())
                task_tokens = task_tokens.tolist()
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_pretrained(model, args.load_pretrained, args,
                            task_tokens=task_tokens)
        # This is critical when only the model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_checkpoint(model, optimizer, lr_scheduler, args,
                            no_deepspeed=args.no_deepspeed_load)
        # This is critical when only the model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    torch.distributed.barrier()
    timers('pretrained checkpoint').stop()
    args.iteration = 0

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        if args.block_lm_ratio > 0.0:
            forward_step = mix_forward_step
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                (train_dataloader, train_block_dataloader),
                                (valid_dataloader, valid_block_dataloader),
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if end_of_train_callback is not None and best_iteration is not None:
            with FileLock(os.path.join(pathlib.Path.home(),
                                       "checkpoint_lock"), timeout=-1):
                args.load = os.path.join(args.save, "best")
                load_checkpoint(model, optimizer, lr_scheduler, args,
                                no_load_optim=True, no_deepspeed=True)
                args.load = None
        torch.distributed.barrier()
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")
    print_rank_0('done :-)')
def main():
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')
    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    gentle_stop = np.array((1,), dtype=bool)
    gentle_stop[0] = False

    def on_ctrl_c(signal, frame):
        gentle_stop[0] = True

    signal.signal(signal.SIGINT, on_ctrl_c)

    parser = argparse.ArgumentParser()
    parser.add_argument("experiment_name")
    parser.add_argument("-d", action='store_true', default=False)
    parser.add_argument("-gen", action='store_true', default=False)
    arguments = parser.parse_args()

    full_name = arguments.experiment_name.split('/')
    experiment_name = full_name.pop()
    experiment_group = full_name.pop() if len(full_name) > 0 else ''
    debug_mode = arguments.d
    generate_data = arguments.gen

    cfg_file_path = u.get_config_file_path(workspace_path, experiment_name,
                                           experiment_group)
    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    ckpt_dir = u.get_checkpoint_dir(log_dir)
    train_fig_dir = u.get_train_fig_dir(log_dir)
    dataset_path = u.get_dataset_path(workspace_path)

    if not os.path.exists(cfg_file_path):
        print('Could not find config file:\n')
        print('{}\n'.format(cfg_file_path))
        exit(-1)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    if not os.path.exists(train_fig_dir):
        os.makedirs(train_fig_dir)
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    args = configparser.ConfigParser()
    args.read(cfg_file_path)
    shutil.copy2(cfg_file_path, log_dir)

    with tf.variable_scope(experiment_name):
        dataset = factory.build_dataset(dataset_path, args)
        queue = factory.build_queue(dataset, args)
        encoder = factory.build_encoder(queue.x, args, is_training=True)
        decoder = factory.build_decoder(queue.y, encoder, args,
                                        is_training=True)
        ae = factory.build_ae(encoder, decoder, args)
        codebook = factory.build_codebook(encoder, dataset, args)
        train_op = factory.build_train_op(ae, args)
        saver = tf.train.Saver(save_relative_paths=True)

    num_iter = args.getint('Training',
                           'NUM_ITER') if not debug_mode else np.iinfo(
                               np.int32).max
    save_interval = args.getint('Training', 'SAVE_INTERVAL')
    model_type = args.get('Dataset', 'MODEL')

    if model_type == 'dsprites':
        dataset.get_sprite_training_images(args)
    else:
        dataset.get_training_images(dataset_path, args)
        dataset.load_bg_images(dataset_path)

    if generate_data:
        print('finished generating synthetic training data for ' +
              experiment_name)
        print('exiting...')
        exit()

    bar = progressbar.ProgressBar(
        maxval=num_iter,
        widgets=[
            ' [', progressbar.Timer(), ' | ',
            progressbar.Counter('%0{}d / {}'.format(len(str(num_iter)),
                                                    num_iter)), ' ] ',
            progressbar.Bar(), ' (', progressbar.ETA(), ') '
        ])

    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=0.9)
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:
        chkpt = tf.train.get_checkpoint_state(ckpt_dir)
        if chkpt and chkpt.model_checkpoint_path:
            saver.restore(sess, chkpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        merged_loss_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

        if not debug_mode:
            print('Training with %s model' % args.get('Dataset', 'MODEL'),
                  os.path.basename(args.get('Paths', 'MODEL_PATH')))
            bar.start()

        queue.start(sess)
        for i in range(ae.global_step.eval(), num_iter):
            if not debug_mode:
                sess.run(train_op)
                if i % 10 == 0:
                    loss = sess.run(merged_loss_summary)
                    summary_writer.add_summary(loss, i)

                bar.update(i)
                if (i + 1) % save_interval == 0:
                    saver.save(sess, checkpoint_file,
                               global_step=ae.global_step)

                    this_x, this_y = sess.run([queue.x, queue.y])
                    reconstr_train = sess.run(decoder.x,
                                              feed_dict={queue.x: this_x})
                    train_imgs = np.hstack(
                        (u.tiles(this_x, 4, 4), u.tiles(reconstr_train, 4, 4),
                         u.tiles(this_y, 4, 4)))
                    cv2.imwrite(
                        os.path.join(train_fig_dir,
                                     'training_images_%s.png' % i),
                        train_imgs * 255)
            else:
                this_x, this_y = sess.run([queue.x, queue.y])
                reconstr_train = sess.run(decoder.x,
                                          feed_dict={queue.x: this_x})
                cv2.imshow(
                    'sample batch',
                    np.hstack((u.tiles(this_x, 3, 3),
                               u.tiles(reconstr_train, 3, 3),
                               u.tiles(this_y, 3, 3))))
                k = cv2.waitKey(0)
                if k == 27:
                    break

            if gentle_stop[0]:
                break

        queue.stop(sess)
        if not debug_mode:
            bar.finish()
        if not gentle_stop[0] and not debug_mode:
            print('To create the embedding run:\n')
            print('ae_embed {}\n'.format(arguments.experiment_name))
parser = argparse.ArgumentParser()
parser.add_argument("experiment_name")
parser.add_argument("obj_id")
arguments = parser.parse_args()

full_name = arguments.experiment_name.split('/')
obj_id = arguments.obj_id
experiment_name = full_name.pop()
experiment_group = full_name.pop() if len(full_name) > 0 else ''

cfg_file_path = u.get_config_file_path(path_workspace, experiment_name,
                                       experiment_group)
list_models = [int(obj_id)]
log_dir = u.get_log_dir(path_workspace, experiment_name, experiment_group)
ckpt_dir = os.path.join(log_dir,
                        'checkpoints_lambda{:d}'.format(int(lambda_reconst)))
checkpoint_file = u.get_checkpoint_basefilename(ckpt_dir)
train_fig_dir = os.path.join(
    log_dir, 'train_figures_lambda{:d}'.format(int(lambda_reconst)))
dataset_path = u.get_dataset_path(path_workspace)
print('dataset_path', dataset_path)

if not os.path.exists(cfg_file_path):
    print('Could not find config file:\n')
    print('{}\n'.format(cfg_file_path))
    exit(-1)
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
                    help='Learning Rate')
parser.add_argument('--w_decay', nargs='?', type=float, default=2e-4,
                    help='Weight Decay')
parser.add_argument('--momentum', nargs='?', type=float, default=0.9,
                    help='Momentum')
parser.add_argument('--lr_decay', nargs='?', type=float, default=1e-1,
                    help='Learning Rate Decay')
parser.add_argument('--resume', nargs='?', type=str, default='',
                    help='Resume training')
parser.add_argument('--dataset', nargs='?', type=str, default='camvid',
                    help='Dataset to use [\'pascal, camvid, ade20k etc\']')
args = parser.parse_args()

out = get_log_dir(here, 'bilinearRes')
net_name = 'bilinearRes'
train(args, out, net_name)
def main():
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')
    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    parser = argparse.ArgumentParser()
    parser.add_argument("experiment_name")
    parser.add_argument('--at_step', default=None, required=False)
    arguments = parser.parse_args()

    full_name = arguments.experiment_name.split('/')
    experiment_name = full_name.pop()
    experiment_group = full_name.pop() if len(full_name) > 0 else ''
    at_step = arguments.at_step

    cfg_file_path = u.get_config_file_path(workspace_path, experiment_name,
                                           experiment_group)
    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    ckpt_dir = u.get_checkpoint_dir(log_dir)
    dataset_path = u.get_dataset_path(workspace_path)
    print(checkpoint_file)
    print(ckpt_dir)
    print('#' * 20)

    if not os.path.exists(cfg_file_path):
        print('Could not find config file:\n')
        print('{}\n'.format(cfg_file_path))
        exit(-1)

    args = configparser.ConfigParser()
    args.read(cfg_file_path)

    with tf.variable_scope(experiment_name):
        dataset = factory.build_dataset(dataset_path, args)
        queue = factory.build_queue(dataset, args)
        encoder = factory.build_encoder(queue.x, args)
        decoder = factory.build_decoder(queue.y, encoder, args)
        ae = factory.build_ae(encoder, decoder, args)
        codebook = factory.build_codebook(encoder, dataset, args)
        saver = tf.train.Saver(save_relative_paths=True)

    batch_size = args.getint('Training', 'BATCH_SIZE')
    model = args.get('Dataset', 'MODEL')

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:
        print(ckpt_dir)
        print('#' * 20)
        factory.restore_checkpoint(sess, saver, ckpt_dir, at_step=at_step)

        if model == 'dsprites':
            codebook.update_embedding_dsprites(sess, args)
        else:
            codebook.update_embedding(sess, batch_size)

        print('Saving new checkpoint ..')
        saver.save(sess, checkpoint_file, global_step=ae.global_step)
        print('done')
def main(): """Main training program.""" # Disable CuDNN. torch.backends.cudnn.enabled = False # Timer. timers = Timers() # Arguments. args = get_args() args.mem_length = args.mem_length if args.transformer_xl else 0 if args.load and not args.new_save_directory: args.experiment_name = os.path.basename(os.path.normpath(args.load)) else: args.experiment_name = args.experiment_name + datetime.now().strftime( "%m-%d-%H-%M") if args.save: args.save = os.path.join(args.save, args.experiment_name) # Pytorch distributed. initialize_distributed(args) # Random seeds for reproducability. set_random_seed(args.seed) # Data stuff. global tokenizer tokenizer = prepare_tokenizer(args) train_data, val_data, test_data, = get_train_val_test_data(args, tokenizer) multi_train_data, multi_val_data = None, None if args.multi_task_ratio > 0.0: multi_train_data, multi_val_data = build_multi_task_dataset( args, tokenizer) # Model, optimizer, and learning rate. model, optimizer, lr_scheduler = setup_model_and_optimizer(args) if args.load is not None: with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"), timeout=-1): args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args) else: args.iteration = 0 torch.distributed.barrier() if args.switch_linear: lr_scheduler.switch_linear(args) summary_writer = None if torch.distributed.get_rank() == 0: print('Pretrain GPT2 model') args.log_dir = None if args.train_iters > 0: args.log_dir = get_log_dir(base=args.summary_dir, name=args.experiment_name) summary_writer = get_sample_writer(log_dir=args.log_dir, iteration=args.iteration) print_and_save_args(args, verbose=True, log_dir=args.log_dir) # Resume data loader if necessary. if args.resume_dataloader: print_rank_0("Resume dataloader") if train_data is not None: train_data.batch_sampler.start_iter = args.iteration % len( train_data) if val_data is not None: start_iter_val = (args.iteration // args.eval_interval) * args.eval_iters val_data.batch_sampler.start_iter = start_iter_val % len(val_data) if multi_train_data is not None: multi_train_data.batch_sampler.start_iter = int( args.iteration * args.multi_task_ratio) % len(multi_train_data) if multi_val_data is not None: start_iter_val = (args.iteration // args.eval_interval ) * args.eval_iters * args.multi_task_ratio multi_val_data.batch_sampler.start_iter = start_iter_val % len( multi_val_data) if train_data is not None: train_data_iterator = iter(train_data) else: train_data_iterator = None if multi_train_data is not None: multi_train_iterator = iter(multi_train_data) else: multi_train_iterator = None if val_data is not None: val_data_iterator = iter(val_data) else: val_data_iterator = None if multi_val_data is not None: multi_val_iterator = iter(multi_val_data) else: multi_val_iterator = None # TODO: figure out how to properly set this especially when resuming training iteration = 0 if args.train_iters > 0: if args.do_train: with ExitStack() as stack: def save_on_exit(args_, model_, optimizer_, lr_scheduler_): save_checkpoint(args_.iteration, model_, optimizer_, lr_scheduler_, args_) # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler) iteration, skipped = train( model, optimizer, lr_scheduler, (train_data_iterator, multi_train_iterator), (val_data_iterator, multi_val_iterator), timers, args, summary_writer=summary_writer) if args.do_valid: prefix = 'the end of training for val data' val_loss = evaluate_and_print_results( prefix, val_data_iterator, model, args, timers, verbose=False, forward_step_func=forward_step) if args.save and 
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, (test_data_iterator, None),
                                   model,
                                   args,
                                   timers,
                                   verbose=True,
                                   forward_step_func=forward_step)
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider(
            args, tokenizer)
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset, args)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert and not args.load:
        module = model
        if isinstance(module, (LocalDDP, TorchDDP)):
            module = module.module
        if isinstance(module, FP16_Module):
            module = module.module
        if not isinstance(module, GLMModel):
            module = module.model
        args.load = args.load_pretrained
        load_checkpoint(module, optimizer, lr_scheduler, args)
        args.load = None
        # This is critical when only the model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    if args.load is not None:
        load_checkpoint(model, optimizer, lr_scheduler, args)
        # This is critical when only the model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()
    args.iteration = 0

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=False, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                train_dataloader,
                                valid_dataloader,
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if best_iteration is not None and end_of_train_callback is not None:
            args.load = os.path.join(args.save, "best")
            load_checkpoint(model, optimizer, lr_scheduler, args)
            args.load = None
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")
    print_rank_0('done :-)')
""" mask = (images < 0.999) mask = mask.all(dim=1) return mask.type_as(images) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-s", "--src_images_dir", required=True, type=str) parser.add_argument("-l", "--logs_dir", required=True, type=str) parser.add_argument("-st", "--style_image_path", required=True, type=str) parser.add_argument("-n", "--num_iters", required=True, type=int) parser.add_argument("-sc", "--style_coeff", default=10000, type=float) parser.add_argument("-cc", "--content_coeff", default=1, type=float) args = parser.parse_args() dst_dir = os.path.join(args.logs_dir, "data", utils.get_log_dir()) if not os.path.exists(dst_dir): os.makedirs(dst_dir) device = data_util.get_device() imsize = 512 loader = transforms.Compose( [transforms.Resize(imsize), transforms.ToTensor()]) style_img = utils.image_loader(args.style_image_path, loader, device) img_names = os.listdir(args.src_images_dir) for img_name in tqdm(img_names): img_path = os.path.join(args.src_images_dir, img_name) content_img = utils.image_loader(img_path, loader, device)
parser.add_argument("--root_dataset", type=str, default='./data/Pascal_VOC') parser.add_argument("--resume", type=str, default='') parser.add_argument("--fcn", type=str, default='32s', choices=['32s', '16s', '8s', '50', '101']) opts = parser.parse_args() # os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu_id) opts.cuda = get_cuda(torch.cuda.is_available() and opts.gpu_id != -1, opts.gpu_id) print('Cuda', opts.cuda) cfg = get_config()[1] opts.cfg = cfg if opts.mode in ['train', 'trainval']: opts.out = get_log_dir('fcn' + opts.fcn, 1, cfg) print('Output logs: ', opts.out) data = get_loader(opts) trainer = Trainer(data, opts) if opts.mode == 'val': trainer.Test() elif opts.mode == 'demo': trainer.Demo() else: trainer.Train()
parser = argparse.ArgumentParser()
parser.add_argument("experiment_name")
parser.add_argument("obj_id")
parser.add_argument("num_iterations")
arguments = parser.parse_args()

full_name = arguments.experiment_name.split('/')
obj_id = int(arguments.obj_id)
num_iterations = int(arguments.num_iterations)
experiment_name = full_name.pop()
experiment_group = full_name.pop() if len(full_name) > 0 else ''

log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
ckpt_dir = os.path.join(log_dir, 'checkpoints_lambda250')
checkpoint_file = u.get_checkpoint_basefilename(ckpt_dir)
print('log_dir', log_dir)

args = configparser.ConfigParser()
cfg_file_path = glob.glob(os.path.join(log_dir, '*.cfg'))[0]
args.read(cfg_file_path)

tf.reset_default_graph()

embedding_dim = 128
image_size = 128
ci = 4
path_embedding_data = './embedding92232s/{:02d}'.format(obj_id)  # path to dir of info \bar_R
embedding_size = 92232
normalize_images = True  # Default False for the non-textured T-LESS CAD mesh; True for textured meshes such as LineMOD
parser.add_argument("--gpu_id", type=int, default=0) parser.add_argument("--backbone", type=str, default="vgg") parser.add_argument("--root_dataset", type=str, default="data/VOC/") parser.add_argument("--resume", type=str, default="") parser.add_argument("--fcn", type=str, default="32s", choices=["32s", "16s", "8s", "50", "101"]) opts = parser.parse_args() # os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu_id) opts.cuda = get_cuda(torch.cuda.is_available() and opts.gpu_id != -1, opts.gpu_id) print("Cuda", opts.cuda) cfg = get_config()[1] opts.cfg = cfg if opts.mode in ["train", "trainval"]: opts.out = get_log_dir("fcn" + opts.fcn, 1, cfg) print("Output logs: ", opts.out) data = get_loader(opts) trainer = Trainer(data, opts) if opts.mode == "val": trainer.Test() elif opts.mode == "demo": trainer.Demo() else: trainer.Train()
""" Train model 1、First train simple loss 2、Second train weight loss """ fm_model = TorchFM(feature_dim=feat_dim, num_dim=NUM_DIM, init_mean=INIT_MEAN) adam_opt = optim.Adam(fm_model.parameters(), lr=LEARNING_RATE) schedular = optim.lr_scheduler.StepLR(adam_opt, step_size=DECAY_FREQ, gamma=DECAY_GAMME) fm_learner = FMLearner(fm_model, adam_opt, schedular, db) fm_learner.compile(train_col='seq', valid_col='seq', test_col='seq', loss_callback=callback_simple_loss) fm_learner.fit(epoch=EPOCH, log_dir=get_log_dir('simple_topcoder', 'fm')) del fm_model T.cuda.empty_cache() fm_model = TorchFM(feature_dim=feat_dim, num_dim=NUM_DIM, init_mean=INIT_MEAN) adam_opt = optim.Adam(fm_model.parameters(), lr=LEARNING_RATE) schedular = optim.lr_scheduler.StepLR(adam_opt, step_size=DECAY_FREQ, gamma=DECAY_GAMME) fm_learner = FMLearner(fm_model, adam_opt, schedular, db) fm_learner.compile(train_col='seq', valid_col='seq', test_col='seq', loss_callback=callback_simple_weight_loss) fm_learner.fit(epoch=EPOCH, log_dir=get_log_dir('weight_topcoder', 'fm')) del fm_model
Unauthorized copying, distribution, reproduction, publication, use of this
file, via any medium is strictly prohibited.
Proprietary and confidential – June 2019
"""
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import numpy as np

from model import (Model, save_freeze_tensorflow_model_for_inference,
                   convert_to_tensor_rt, inference_from_tensor_rt_graph)
from preprocessing import preprocess
from utils import (init_configuration, download_data, get_log_dir,
                   get_arguments_as_dict)

# Read the parameters from the config file.
all_params = init_configuration(config_file='config/config.yaml')

# Get the log directory to save the model and results.
log_dir = get_log_dir(all_params)

print('downloading data')
train_path, test_path = download_data(reload=True)

print('preprocessing data')
dataset_train, dataset_test, dataset_train_lengths, dataset_test_lengths, \
    dataset_test_for_predict, dataset_test_lengths_for_predict, x_test = \
    preprocess(train_path, test_path, all_params)

print('initialize and train the model')
model = Model(log_dir, all_params)
model.train(dataset_train, dataset_test, dataset_train_lengths,
            dataset_test_lengths)
model.predict(dataset_test_for_predict, dataset_test_lengths_for_predict)
if __name__ == "__main__": # parse arguments parser = argparse.ArgumentParser() parser.add_argument("-src", "--src_img_path", required=True, type=str) parser.add_argument("-style", "--style_img_path", required=True, type=str) parser.add_argument("-n", "--num_iters", required=True, type=int) parser.add_argument("-l", "--log_dir", required=True, type=str) parser.add_argument("-sc", "--style_coeff", default=0.5, type=float) parser.add_argument("-cc", "--content_coeff", default=0.5, type=float) args = parser.parse_args() run_dir = os.path.join(args.log_dir, "runs", "img_style_transfer", utils.get_log_dir()) if not os.path.exists(run_dir): os.makedirs(run_dir) writer = SummaryWriter(run_dir, flush_secs=10) img_size = (512, 512) device = data_util.get_device() mean = torch.tensor([0.485, 0.456, 0.406], device=device, dtype=torch.float)[None, :, None, None] std = torch.tensor([0.229, 0.224, 0.225], device=device, dtype=torch.float)[None, :, None, None] # normalize = transforms.Normalize(mean=mean, std=std)
def main():
    # 0. input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--gpu', type=int, help='GPU device to use',
                        default=0)
    parser.add_argument('-d', '--dataset',
                        help='VOC, CamVid, Cityscapes, SUNRGBD, Custom',
                        default='CamVid')
    parser.add_argument('-dr', '--datasetroot', help='dataset root path',
                        default='/home/hongkai/PycharmProjects/Datasets')
    parser.add_argument('-dt', '--degradedtrain',
                        help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-dv', '--degradedval',
                        help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-ds', '--degradedtest',
                        help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-c', '--config', type=int, default=1,
                        choices=configurations.keys())
    parser.add_argument('-r', '--resume', help='Checkpoint path')
    args = parser.parse_args()

    gpu = args.gpu
    dataset = args.dataset
    dataset_root = args.datasetroot
    degradedtrain = args.degradedtrain
    degradedval = args.degradedval
    degradedtest = args.degradedtest
    cfg = configurations[args.config]
    out = utils.get_log_dir('fcn8s-atonce', args.config, cfg)
    resume = args.resume

    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)

    # 1. dataset
    root = osp.expanduser(osp.join(dataset_root, dataset))
    kwargs = {'num_workers': 4, 'pin_memory': True} if cuda else {}
    if dataset == 'VOC':
        train_data = datasets.VOCSeg(root, split='train',
                                     dataset=degradedtrain, transform=True)
        val_data = datasets.VOCSeg(root, split='val', dataset=degradedval,
                                   transform=True)
        test_data = datasets.VOCSeg(root, split='test', dataset=degradedtest,
                                    transform=True)
    elif dataset == "CamVid":
        train_data = datasets.CamVidSeg(root, split='train',
                                        dataset=degradedtrain, transform=True)
        val_data = datasets.CamVidSeg(root, split='val', dataset=degradedval,
                                      transform=True)
        test_data = datasets.CamVidSeg(root, split='test',
                                       dataset=degradedtest, transform=True)
    elif dataset == "Cityscapes":
        train_data = datasets.CityscapesSeg(root, split='train',
                                            dataset=degradedtrain,
                                            transform=True)
        val_data = datasets.CityscapesSeg(root, split='val',
                                          dataset=degradedval, transform=True)
        test_data = datasets.CityscapesSeg(root, split='test',
                                           dataset=degradedtest,
                                           transform=True)
    elif dataset == "Custom":
        train_data = datasets.CustomSeg(root, split='train',
                                        dataset=degradedtrain, transform=True)
        val_data = datasets.CustomSeg(root, split='val', dataset=degradedval,
                                      transform=True)
        test_data = datasets.CustomSeg(root, split='test',
                                       dataset=degradedtest, transform=True)
    else:
        train_data = datasets.SUNSeg(root, split='train',
                                     dataset=degradedtrain, transform=True)
        val_data = datasets.SUNSeg(root, split='val', dataset=degradedval,
                                   transform=True)
        test_data = datasets.SUNSeg(root, split='test', dataset=degradedtest,
                                    transform=True)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=1,
                                               shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=1,
                                             shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=1,
                                              shuffle=False, **kwargs)

    # 2. model
    model = models.FCN8sAtOnce(n_class=train_data.n_classes)
    start_epoch = 0
    start_iteration = 0
    if resume:
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['model_state_dict'])
        start_epoch = checkpoint['epoch']
        start_iteration = checkpoint['iteration']
    else:
        vgg16 = torchfcn.models.VGG16(pretrained=True)
        model.copy_params_from_vgg16(vgg16)
    device = torch.device("cuda" if cuda else "cpu")
    model = model.to(device)

    # 3. optimizer
    optim = torch.optim.SGD(
        [
            # Bias parameters get twice the learning rate and no weight
            # decay, as the param-group overrides below specify.
            {'params': utils.get_parameters(model, bias=False)},
            {'params': utils.get_parameters(model, bias=True),
             'lr': cfg['lr'] * 2, 'weight_decay': 0},
        ],
        lr=cfg['lr'],
        momentum=cfg['momentum'],
        weight_decay=cfg['weight_decay'])
    if resume:
        optim.load_state_dict(checkpoint['optim_state_dict'])

    # 4. trainer
    trainer = Trainer(
        cuda=cuda,
        model=model,
        optimizer=optim,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        out=out,
        max_iter=cfg['max_iteration'],
        interval_validate=cfg.get('interval_validate', len(train_loader)),
    )
    trainer.epoch = start_epoch
    trainer.iteration = start_iteration
    trainer.train()
parser.add_argument("--no-mem", action="store_true", default=False, help="don't use memory in the model") args = parser.parse_args() # Define model name suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") default_model_name = "{}_{}_seed{}_{}".format(args.env, args.algo, args.seed, suffix) model_name = args.model or default_model_name # Define logger and Tensorboard writer and log script arguments logger = utils.get_logger(model_name) if args.tb: from tensorboardX import SummaryWriter writer = SummaryWriter(utils.get_log_dir(model_name)) logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Generate environments envs = [] for i in range(args.procs): env = gym.make(args.env) env.seed(args.seed + i) envs.append(env)