Example #1
    def _init_models(args):
        if args.multi_env is not None:
            assert len(args.multi_demos) == len(args.multi_episodes)

        args.model = args.model or ImitationLearning.default_model_name(args)
        utils.configure_logging(args.model)
        logger = logging.getLogger(__name__)

        self.il_learn_forward = ImitationLearning(args)
        self.il_learn_backward = ImitationLearning(args)

        # Define the CSV log header and Tensorboard writer
        self.header = [
            "update", "frames", "FPS", "duration", "entropy", "policy_loss",
            "train_accuracy", "validation_accuracy"
        ]
        if args.multi_env is None:
            self.header.extend(
                ["validation_return", "validation_success_rate"])
        else:
            self.header.extend(
                ["validation_return_{}".format(env) for env in args.multi_env])
            self.header.extend([
                "validation_success_rate_{}".format(env)
                for env in args.multi_env
            ])
        writer = None
        if args.tb:
            from tensorboardX import SummaryWriter
            writer = SummaryWriter(utils.get_log_dir(args.model))

        # Define csv writer
        self.csv_writer = None
        self.csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
        first_created = not os.path.exists(self.csv_path)
        # we don't buffer data going into the csv log, because we assume
        # that one update takes much longer than one write to the log
        self.csv_writer = csv.writer(open(self.csv_path, 'a', 1))
        if first_created:
            self.csv_writer.writerow(self.header)

        # Get the status path
        self.status_path = os.path.join(utils.get_log_dir(args.model),
                                        'status.json')

        # Log command, availability of CUDA, and model
        logger.info(args)
        logger.info("CUDA available: {}".format(torch.cuda.is_available()))
        logger.info(self.il_learn_forward.acmodel)
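The CSV log above is opened with a buffer size of 1, i.e. line buffering, so every row reaches disk as soon as it is written. A minimal standalone sketch of the same append-and-flush pattern (the path and columns are illustrative):

import csv
import os

log_path = 'log.csv'  # illustrative path
first_created = not os.path.exists(log_path)
# buffering=1 selects line buffering in text mode, so each writerow is flushed
csv_writer = csv.writer(open(log_path, 'a', buffering=1))
if first_created:
    csv_writer.writerow(['update', 'frames', 'FPS'])
csv_writer.writerow([1, 1024, 512.0])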
Example #2
def main(argv=None):
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string(
        'training_data',
        '../data/tfrecords/jet_training_8101_pT-ALL_eta-ALL_Pythia.tfrecords',
        'the training data set'
    )
    tf.app.flags.DEFINE_string(
        'validation_data',
        '../data/tfrecords/jet_validation_2701_pT-ALL_eta-ALL_Pythia.tfrecords',
        'the validation data set'
    )
    tf.app.flags.DEFINE_integer('batch_size', 500, 'batch size')
    tf.app.flags.DEFINE_integer('num_epochs', 30, 'the number of epochs')

    log_dir = get_log_dir(dname='test', creation=True)

    train(
        tfrecords_path=FLAGS.training_data,
        tfevents_dir=log_dir.tfevents.training.path,
        ckpt_dir=log_dir.ckpt.path,
        benchmark_path=log_dir.path,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs
    )
   
    evaluate(
        training_data=FLAGS.training_data,
        validation_data=FLAGS.validation_data,
        log_dir=log_dir
    )

    draw_all_qg_histograms(qg_histogram_dir=log_dir.qg_histogram)
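In this example get_log_dir returns an object whose subdirectories are reachable as attributes (log_dir.tfevents.training.path, log_dir.ckpt.path, and log_dir.saved_models.path in Example #5). The real helper is not shown; a minimal sketch of such an attribute-style directory tree, assuming a ./logs root:

import os

class LogNode:
    # each node wraps one directory; named children become attributes
    def __init__(self, path):
        self.path = path
        os.makedirs(path, exist_ok=True)

    def child(self, name):
        node = LogNode(os.path.join(self.path, name))
        setattr(self, name, node)
        return node

def get_log_dir(dname, creation=True):  # signature taken from the call above
    # the creation flag is ignored in this sketch; directories are always made
    root = LogNode(os.path.join('./logs', dname))
    root.child('ckpt')
    root.child('saved_models')
    root.child('qg_histogram')
    root.child('tfevents').child('training')
    return root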
Example #3
    def __init__(self, name="messages", show_date=True, separator=" ", extension="txt"):
        """
        Initializes a new logger object. The defaults are for status and error
        message logging, but if a new name is used, a new directory with that
        name is created inside the log directory to store the data produced by
        that logger.

        Arguments:
            name: The logger's name. All logs go into a directory with the logger's name
            show_date: Whether or not a date should be printed with every log message
            separator: The delimiter for all strings passed to the logger in a log command
            extension: The file type for the log messages. Defaults to 'txt'.
        """

        self.name = name
        self.directory = utils.get_log_dir()
        self.show_date = show_date
        self.separator = separator
        self.extension = extension

        # check for the log directory, create it if necessary
        if not self.directory:
            new_dir = str.replace(utils.get_resource_files_prefix(),
                                  "resources", "log")
            os.mkdir(new_dir)

        # check for the log/name directory, create it if it doesn't exist
        try:
            os.makedirs("%s%s/" % (self.directory, self.name))
        except OSError:
            if not os.path.isdir("%s%s/" % (self.directory, self.name)):
                raise

        # record that the logger was created
        self.log("logger created")
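Assuming the __init__ above belongs to a class named Logger (the class statement is not part of the snippet) and that log() joins its arguments with the configured separator, usage might look like:

# hypothetical usage; the class name and log() behavior are assumptions
errors = Logger(name="errors", show_date=True, separator=" | ")
errors.log("failed to open", "config.yaml")  # written under log/errors/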
Example #4
def build_codebook_from_name(experiment_name,
                             experiment_group='',
                             return_dataset=False,
                             return_decoder=False):
    import os
    import configparser
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')

    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    import utils as u
    import tensorflow as tf

    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    cfg_file_path = u.get_train_config_exp_file_path(log_dir, experiment_name)
    dataset_path = u.get_dataset_path(workspace_path)

    if os.path.exists(cfg_file_path):
        args = configparser.ConfigParser()
        args.read(cfg_file_path)
    else:
        print('ERROR: Config file not found: ', cfg_file_path)
        exit()

    with tf.variable_scope(experiment_name):
        dataset = build_dataset(dataset_path, args)
        x = tf.placeholder(tf.float32, [None] + list(dataset.shape))
        encoder = build_encoder(x, args)
        codebook = build_codebook(encoder, dataset, args)
        if return_decoder:
            reconst_target = tf.placeholder(tf.float32, [None] + list(dataset.shape))
            decoder = build_decoder(reconst_target, encoder, args)

    if return_dataset:
        if return_decoder:
            return codebook, dataset, decoder
        else:
            return codebook, dataset
    else:
        return codebook
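A usage sketch based on the signature above; the experiment name and group are placeholders:

# hypothetical call; 'my_autoencoder' and 'exp_group' are illustrative names
codebook, dataset, decoder = build_codebook_from_name(
    'my_autoencoder',
    experiment_group='exp_group',
    return_dataset=True,
    return_decoder=True)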
Example #5
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--log_dir',
                        type=str,
                        required=True,
                        help='the path of the log directory')

    args = parser.parse_args()

    log_dir = get_log_dir(path=args.log_dir, creation=False)

    path_and_step = get_saved_model_paths(log_dir.saved_models.path)
    for i, (saved_model_path, step) in enumerate(path_and_step):
        print("\n\n\n[{i}/{total}]: {path}".format(i=i,
                                                   total=len(path_and_step),
                                                   path=saved_model_path))
        evaluate(saved_model_path, step, log_dir)
Example #6
    def __init__(self, adj_mx, **kwargs):
        self._kwargs = kwargs
        self._data_kwargs = kwargs.get('data')
        self._model_kwargs = kwargs.get('model')
        self._train_kwargs = kwargs.get('train')
        self.dataset_name = self._data_kwargs['dataset_dir'].split('/')[-1]
        self.adj_mx = adj_mx
        self.model_params = dict()
        self.model_params['seq_len'] = 30
        self.K = [1, 5, 10, 20, 50, 100]

        model_name = 'net_act_orig'  # self._kwargs['model_name']
        self.log_file_name = utils.get_log_dir(log_dir=self._kwargs['log_dir'],
                                               model_name=model_name,
                                               dataset_name=self.dataset_name)
        os.makedirs(os.path.join(self._kwargs['save_dir'], self.dataset_name,
                                 self._kwargs['model_name']),
                    exist_ok=True)

        log_level = self._kwargs.get('log_level', 'INFO')
        self._logger = utils.get_logger(self.log_file_name,
                                        name=__name__,
                                        level=log_level)
        self._writer = tf.summary.FileWriter(self.log_file_name)
        self._logger.info(json.dumps(kwargs, indent=2))
        self._saved_file_name = 'best_model.ckpt'

        user_id, reverse_user_id, item_id, reverse_item_id = \
            utils.load_ids(self._data_kwargs['dataset_dir'], self._data_kwargs['ids_file_name'])
        print(len(user_id), len(reverse_user_id), len(item_id),
              len(reverse_item_id))

        self.n_users = len(user_id)
        self.n_context = self._model_kwargs['context_size']

        data_examples, self.user_history, num_bins = utils.load_dataset_timestamp(
            self._data_kwargs['dataset_dir'],
            self._data_kwargs['dataset_name'], self.n_users, self.n_context,
            self.model_params['seq_len'])
        self.num_bins = num_bins

        self.model_params['batch_size'] = self._data_kwargs['batch_size']
        self.model_params['user_size'] = self.n_users
        self.model_params['item_size'] = len(item_id)
        self.model_params['state_size'] = self._model_kwargs['state_size']
        self.model_params['emb_size'] = self._model_kwargs['emb_size']
        self.model_params['lr'] = self._train_kwargs['base_lr']
        self.model_params['n_bins'] = self.num_bins
        self.model_params['context_size'] = self.n_context
        self.model_params['start_lr'] = len(
            data_examples) // self._data_kwargs['batch_size']
        self.model_params['min_lr'] = self._train_kwargs['min_learning_rate']
        self.model_params['use_attn'] = self._model_kwargs['use_attn']
        self.model_params['normalize'] = self._model_kwargs['normalize']
        self.model_params['max_diff'] = self._model_kwargs['max_diff']
        if self._model_kwargs['n_samples'] == -1:
            self.model_params['n_samples'] = len(item_id)
        else:
            self.model_params['n_samples'] = self._model_kwargs['n_samples']
        self.model_params['comb'] = self._model_kwargs['comb']

        self.data_iterator = utils.Loader(data_examples,
                                          options=self.model_params)
Example #7
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    pretrain_glm.tokenizer = tokenizer
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    train_block_dataloader, valid_block_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        if mpu.get_model_parallel_rank() == 0:
            train_dataset, valid_dataset = train_valid_datasets_provider(
                args, tokenizer)
            train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
                train_dataset, valid_dataset, args)
            if args.no_validation:
                valid_dataloader = None
            train_iters = torch.cuda.LongTensor([len(train_dataloader)])
        else:
            train_iters = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(train_iters,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        if mpu.get_model_parallel_rank() != 0:
            args.train_iters_per_epoch = train_iters[0].item()
            args.train_iters = args.epochs * args.train_iters_per_epoch

            train_dataloader = FakeDataloader(args.train_iters_per_epoch)
            if args.no_validation:
                valid_dataloader = None
            else:
                valid_dataloader = FakeDataloader(None)
        if args.block_lm_ratio > 0.0:
            if mpu.get_model_parallel_rank() == 0:
                train_block_dataset, valid_block_dataset = train_valid_datasets_provider(
                    args, tokenizer, pattern_text=True)
                train_block_dataloader = make_data_loader(
                    train_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    args.train_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
                valid_block_dataloader = make_data_loader(
                    valid_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    (args.train_iters // args.eval_interval + 1) *
                    args.eval_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
            else:
                train_block_dataloader = FakeDataloader(args.train_iters)
                valid_block_dataloader = FakeDataloader(None)
            train_block_dataloader, valid_block_dataloader = iter(
                train_block_dataloader), iter(valid_block_dataloader)

    timers('train/valid/test dataset/dataloader').stop()
    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0 and not args.no_validation:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert:
        task_tokens = None
        if args.continuous_prompt and args.prompt_init:
            if mpu.get_model_parallel_rank() == 0:
                dataset = train_dataloader.dataset
                processor, pvp = dataset.processor, dataset.pvp
                task_tokens = []
                for label in processor.get_labels():
                    verbalizer = pvp.verbalize(label)[0]
                    verbalizer_ids = tokenizer.EncodeAsIds(
                        verbalizer).tokenization
                    task_tokens += verbalizer_ids
                print_rank_0("Task tokens: " +
                             tokenizer.DecodeIds(task_tokens))
                num_task_tokens = len(task_tokens)
            else:
                num_task_tokens, task_tokens = 0, []
            num_task_tokens = torch.cuda.LongTensor([num_task_tokens])
            torch.distributed.broadcast(num_task_tokens,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            num_task_tokens = num_task_tokens.item()
            if num_task_tokens > 0:
                if mpu.get_model_parallel_rank() == 0:
                    task_tokens = torch.cuda.LongTensor(task_tokens)
                else:
                    task_tokens = torch.empty(
                        num_task_tokens,
                        device=torch.cuda.current_device(),
                        dtype=torch.long)
                torch.distributed.broadcast(
                    task_tokens,
                    mpu.get_model_parallel_src_rank(),
                    group=mpu.get_model_parallel_group())
                task_tokens = task_tokens.tolist()
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_pretrained(model,
                            args.load_pretrained,
                            args,
                            task_tokens=task_tokens)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_checkpoint(model,
                            optimizer,
                            lr_scheduler,
                            args,
                            no_deepspeed=args.no_deepspeed_load)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    torch.distributed.barrier()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        if args.block_lm_ratio > 0.0:
            forward_step = mix_forward_step
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                (train_dataloader, train_block_dataloader),
                                (valid_dataloader, valid_block_dataloader),
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if end_of_train_callback is not None and best_iteration is not None:
            with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                          timeout=-1):
                args.load = os.path.join(args.save, "best")
                load_checkpoint(model,
                                optimizer,
                                lr_scheduler,
                                args,
                                no_load_optim=True,
                                no_deepspeed=True)
                args.load = None
        torch.distributed.barrier()
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")

    print_rank_0('done :-)')
Example #8
def main():
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')

    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    gentle_stop = np.array((1, ), dtype=bool)
    gentle_stop[0] = False

    def on_ctrl_c(signal, frame):
        gentle_stop[0] = True

    signal.signal(signal.SIGINT, on_ctrl_c)

    parser = argparse.ArgumentParser()
    parser.add_argument("experiment_name")
    parser.add_argument("-d", action='store_true', default=False)
    parser.add_argument("-gen", action='store_true', default=False)
    arguments = parser.parse_args()

    full_name = arguments.experiment_name.split('/')

    experiment_name = full_name.pop()
    experiment_group = full_name.pop() if len(full_name) > 0 else ''

    debug_mode = arguments.d
    generate_data = arguments.gen

    cfg_file_path = u.get_config_file_path(workspace_path, experiment_name,
                                           experiment_group)
    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    ckpt_dir = u.get_checkpoint_dir(log_dir)
    train_fig_dir = u.get_train_fig_dir(log_dir)
    dataset_path = u.get_dataset_path(workspace_path)

    if not os.path.exists(cfg_file_path):
        print('Could not find config file:\n')
        print('{}\n'.format(cfg_file_path))
        exit(-1)

    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    if not os.path.exists(train_fig_dir):
        os.makedirs(train_fig_dir)
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    args = configparser.ConfigParser()
    args.read(cfg_file_path)

    shutil.copy2(cfg_file_path, log_dir)

    with tf.variable_scope(experiment_name):
        dataset = factory.build_dataset(dataset_path, args)
        queue = factory.build_queue(dataset, args)
        encoder = factory.build_encoder(queue.x, args, is_training=True)
        decoder = factory.build_decoder(queue.y,
                                        encoder,
                                        args,
                                        is_training=True)
        ae = factory.build_ae(encoder, decoder, args)
        codebook = factory.build_codebook(encoder, dataset, args)
        train_op = factory.build_train_op(ae, args)
        saver = tf.train.Saver(save_relative_paths=True)

    num_iter = args.getint(
        'Training', 'NUM_ITER') if not debug_mode else np.iinfo(np.int32).max
    save_interval = args.getint('Training', 'SAVE_INTERVAL')
    model_type = args.get('Dataset', 'MODEL')

    if model_type == 'dsprites':
        dataset.get_sprite_training_images(args)
    else:
        dataset.get_training_images(dataset_path, args)
        dataset.load_bg_images(dataset_path)

    if generate_data:
        print('finished generating synthetic training data for ' + experiment_name)
        print('exiting...')
        exit()

    bar = progressbar.ProgressBar(maxval=num_iter,
                                  widgets=[
                                      ' [',
                                      progressbar.Timer(), ' | ',
                                      progressbar.Counter('%0{}d / {}'.format(
                                          len(str(num_iter)), num_iter)),
                                      ' ] ',
                                      progressbar.Bar(), ' (',
                                      progressbar.ETA(), ') '
                                  ])

    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=0.9)
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:

        chkpt = tf.train.get_checkpoint_state(ckpt_dir)
        if chkpt and chkpt.model_checkpoint_path:
            saver.restore(sess, chkpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        merged_loss_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(ckpt_dir, sess.graph)

        if not debug_mode:
            print('Training with %s model' % args.get('Dataset', 'MODEL'),
                  os.path.basename(args.get('Paths', 'MODEL_PATH')))
            bar.start()

        queue.start(sess)
        for i in range(ae.global_step.eval(), num_iter):
            if not debug_mode:
                sess.run(train_op)
                if i % 10 == 0:
                    loss = sess.run(merged_loss_summary)
                    summary_writer.add_summary(loss, i)

                bar.update(i)
                if (i + 1) % save_interval == 0:
                    saver.save(sess,
                               checkpoint_file,
                               global_step=ae.global_step)

                    this_x, this_y = sess.run([queue.x, queue.y])
                    reconstr_train = sess.run(decoder.x,
                                              feed_dict={queue.x: this_x})
                    train_imgs = np.hstack(
                        (u.tiles(this_x, 4,
                                 4), u.tiles(reconstr_train, 4,
                                             4), u.tiles(this_y, 4, 4)))
                    cv2.imwrite(
                        os.path.join(train_fig_dir,
                                     'training_images_%s.png' % i),
                        train_imgs * 255)
            else:

                this_x, this_y = sess.run([queue.x, queue.y])
                reconstr_train = sess.run(decoder.x,
                                          feed_dict={queue.x: this_x})
                cv2.imshow(
                    'sample batch',
                    np.hstack((u.tiles(this_x, 3,
                                       3), u.tiles(reconstr_train, 3,
                                                   3), u.tiles(this_y, 3, 3))))
                k = cv2.waitKey(0)
                if k == 27:
                    break

            if gentle_stop[0]:
                break

        queue.stop(sess)
        if not debug_mode:
            bar.finish()
        if not gentle_stop[0] and not debug_mode:
            print('To create the embedding run:\n')
            print('ae_embed {}\n'.format(full_name))
Example #9
parser = argparse.ArgumentParser()
parser.add_argument("experiment_name")
parser.add_argument("obj_id")
arguments = parser.parse_args()

full_name = arguments.experiment_name.split('/')
obj_id = arguments.obj_id

experiment_name = full_name.pop()
experiment_group = full_name.pop() if len(full_name) > 0 else ''

cfg_file_path = u.get_config_file_path(path_workspace, experiment_name,
                                       experiment_group)
list_models = [int(obj_id)]

log_dir = u.get_log_dir(path_workspace, experiment_name, experiment_group)
ckpt_dir = os.path.join(log_dir,
                        'checkpoints_lambda{:d}'.format(int(lambda_reconst)))
checkpoint_file = u.get_checkpoint_basefilename(ckpt_dir)
train_fig_dir = os.path.join(
    log_dir, 'train_figures_lambda{:d}'.format(int(lambda_reconst)))
dataset_path = u.get_dataset_path(path_workspace)
print('dataset_path', dataset_path)

if not os.path.exists(cfg_file_path):
    print('Could not find config file:\n')
    print('{}\n'.format(cfg_file_path))
    exit(-1)

if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
Example #10
                        help='Learning Rate')
    parser.add_argument('--w_decay',
                        nargs='?',
                        type=float,
                        default=2e-4,
                        help='Weight Decay')
    parser.add_argument('--momentum',
                        nargs='?',
                        type=float,
                        default=0.9,
                        help='momentum')
    parser.add_argument('--lr_decay',
                        nargs='?',
                        type=float,
                        default=1e-1,
                        help='Learning Rate Decay')
    parser.add_argument('--resume',
                        nargs='?',
                        type=str,
                        default='',
                        help='Resume training')
    parser.add_argument('--dataset',
                        nargs='?',
                        type=str,
                        default='camvid',
                        help='Dataset to use [\'pascal, camvid, ade20k etc\']')
    args = parser.parse_args()
    out = get_log_dir(here, 'bilinearRes')
    net_name = 'bilinearRes'
    train(args, out, net_name)
Example #11
def main():
    workspace_path = os.environ.get('AE_WORKSPACE_PATH')

    if workspace_path is None:
        print('Please define a workspace path:\n')
        print('export AE_WORKSPACE_PATH=/path/to/workspace\n')
        exit(-1)

    parser = argparse.ArgumentParser()
    parser.add_argument("experiment_name")
    parser.add_argument('--at_step', default=None, required=False)
    arguments = parser.parse_args()
    full_name = arguments.experiment_name.split('/')

    experiment_name = full_name.pop()
    experiment_group = full_name.pop() if len(full_name) > 0 else ''
    at_step = arguments.at_step

    cfg_file_path = u.get_config_file_path(workspace_path, experiment_name,
                                           experiment_group)
    log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
    checkpoint_file = u.get_checkpoint_basefilename(log_dir)
    ckpt_dir = u.get_checkpoint_dir(log_dir)
    dataset_path = u.get_dataset_path(workspace_path)

    print(checkpoint_file)
    print(ckpt_dir)
    print('#' * 20)

    if not os.path.exists(cfg_file_path):
        print('Could not find config file:\n')
        print('{}\n'.format(cfg_file_path))
        exit(-1)

    args = configparser.ConfigParser()
    args.read(cfg_file_path)

    with tf.variable_scope(experiment_name):
        dataset = factory.build_dataset(dataset_path, args)
        queue = factory.build_queue(dataset, args)
        encoder = factory.build_encoder(queue.x, args)
        decoder = factory.build_decoder(queue.y, encoder, args)
        ae = factory.build_ae(encoder, decoder, args)
        codebook = factory.build_codebook(encoder, dataset, args)
        saver = tf.train.Saver(save_relative_paths=True)

    batch_size = args.getint('Training', 'BATCH_SIZE')
    model = args.get('Dataset', 'MODEL')

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:

        print(ckpt_dir)
        print('#' * 20)

        factory.restore_checkpoint(sess, saver, ckpt_dir, at_step=at_step)

        # chkpt = tf.train.get_checkpoint_state(ckpt_dir)
        # if chkpt and chkpt.model_checkpoint_path:
        #     print chkpt.model_checkpoint_path
        #     saver.restore(sess, chkpt.model_checkpoint_path)
        # else:
        #     print 'No checkpoint found. Expected one in:\n'
        #     print '{}\n'.format(ckpt_dir)
        #     exit(-1)

        if model == 'dsprites':
            codebook.update_embedding_dsprites(sess, args)
        else:
            codebook.update_embedding(sess, batch_size)

        print('Saving new checkpoint ..', end='')

        saver.save(sess, checkpoint_file, global_step=ae.global_step)

        print('done', end='')
Example #12
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False
    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()
    args.mem_length = args.mem_length if args.transformer_xl else 0
    if args.load and not args.new_save_directory:
        args.experiment_name = os.path.basename(os.path.normpath(args.load))
    else:
        args.experiment_name = args.experiment_name + datetime.now().strftime(
            "%m-%d-%H-%M")
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    global tokenizer
    tokenizer = prepare_tokenizer(args)
    train_data, val_data, test_data = get_train_val_test_data(args, tokenizer)
    multi_train_data, multi_val_data = None, None
    if args.multi_task_ratio > 0.0:
        multi_train_data, multi_val_data = build_multi_task_dataset(
            args, tokenizer)

    # Model, optimizer, and learning rate.
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)

    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            args.iteration = load_checkpoint(model, optimizer, lr_scheduler,
                                             args)
    else:
        args.iteration = 0
    torch.distributed.barrier()
    if args.switch_linear:
        lr_scheduler.switch_linear(args)

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        print('Pretrain GPT2 model')
        args.log_dir = None
        if args.train_iters > 0:
            args.log_dir = get_log_dir(base=args.summary_dir,
                                       name=args.experiment_name)
            summary_writer = get_sample_writer(log_dir=args.log_dir,
                                               iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Resume data loader if necessary.
    if args.resume_dataloader:
        print_rank_0("Resume dataloader")
        if train_data is not None:
            train_data.batch_sampler.start_iter = args.iteration % len(
                train_data)
        if val_data is not None:
            start_iter_val = (args.iteration //
                              args.eval_interval) * args.eval_iters
            val_data.batch_sampler.start_iter = start_iter_val % len(val_data)
        if multi_train_data is not None:
            multi_train_data.batch_sampler.start_iter = int(
                args.iteration * args.multi_task_ratio) % len(multi_train_data)
        if multi_val_data is not None:
            start_iter_val = (args.iteration // args.eval_interval
                              ) * args.eval_iters * args.multi_task_ratio
            multi_val_data.batch_sampler.start_iter = start_iter_val % len(
                multi_val_data)
    if train_data is not None:
        train_data_iterator = iter(train_data)
    else:
        train_data_iterator = None
    if multi_train_data is not None:
        multi_train_iterator = iter(multi_train_data)
    else:
        multi_train_iterator = None
    if val_data is not None:
        val_data_iterator = iter(val_data)
    else:
        val_data_iterator = None
    if multi_val_data is not None:
        multi_val_iterator = iter(multi_val_data)
    else:
        multi_val_iterator = None

    # TODO: figure out how to properly set this especially when resuming training
    iteration = 0
    if args.train_iters > 0:
        if args.do_train:
            with ExitStack() as stack:

                def save_on_exit(args_, model_, optimizer_, lr_scheduler_):
                    save_checkpoint(args_.iteration, model_, optimizer_,
                                    lr_scheduler_, args_)

                # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler)
                iteration, skipped = train(
                    model,
                    optimizer,
                    lr_scheduler, (train_data_iterator, multi_train_iterator),
                    (val_data_iterator, multi_val_iterator),
                    timers,
                    args,
                    summary_writer=summary_writer)

        if args.do_valid:
            prefix = 'the end of training for val data'
            val_loss = evaluate_and_print_results(
                prefix,
                val_data_iterator,
                model,
                args,
                timers,
                verbose=False,
                forward_step_func=forward_step)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, (test_data_iterator, None),
                                   model,
                                   args,
                                   timers,
                                   verbose=True,
                                   forward_step_func=forward_step)
Example #13
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider(
            args, tokenizer)
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset, args)
    timers('train/valid/test dataset/dataloader').stop()
    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert and not args.load:
        module = model
        if isinstance(module, (LocalDDP, TorchDDP)):
            module = module.module
        if isinstance(module, FP16_Module):
            module = module.module
        if not isinstance(module, GLMModel):
            module = module.model
        args.load = args.load_pretrained
        load_checkpoint(module, optimizer, lr_scheduler, args)
        args.load = None
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    if args.load is not None:
        load_checkpoint(model, optimizer, lr_scheduler, args)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=False, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                train_dataloader,
                                valid_dataloader,
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if best_iteration is not None and end_of_train_callback is not None:
            args.load = os.path.join(args.save, "best")
            load_checkpoint(model, optimizer, lr_scheduler, args)
            args.load = None
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")

    print_rank_0('done :-)')
Example #14
    """
    mask = (images < 0.999)
    mask = mask.all(dim=1)
    return mask.type_as(images)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_images_dir", required=True, type=str)
    parser.add_argument("-l", "--logs_dir", required=True, type=str)
    parser.add_argument("-st", "--style_image_path", required=True, type=str)
    parser.add_argument("-n", "--num_iters", required=True, type=int)
    parser.add_argument("-sc", "--style_coeff", default=10000, type=float)
    parser.add_argument("-cc", "--content_coeff", default=1, type=float)
    args = parser.parse_args()
    dst_dir = os.path.join(args.logs_dir, "data", utils.get_log_dir())
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    device = data_util.get_device()
    imsize = 512
    loader = transforms.Compose(
        [transforms.Resize(imsize),
         transforms.ToTensor()])

    style_img = utils.image_loader(args.style_image_path, loader, device)

    img_names = os.listdir(args.src_images_dir)
    for img_name in tqdm(img_names):
        img_path = os.path.join(args.src_images_dir, img_name)
        content_img = utils.image_loader(img_path, loader, device)
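The def line and docstring of the mask helper at the top of this example are cut off. A self-contained reconstruction, assuming the input is a batch of images shaped (N, C, H, W) with values in [0, 1] (the function name is hypothetical):

import torch

def foreground_mask(images: torch.Tensor) -> torch.Tensor:
    # hypothetical reconstruction of the truncated helper above:
    # a pixel counts as foreground only if it is darker than the
    # near-white threshold in every channel
    mask = (images < 0.999)      # (N, C, H, W) boolean
    mask = mask.all(dim=1)       # reduce over channels -> (N, H, W)
    return mask.type_as(images)  # cast back to the input dtype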
Example #15
    parser.add_argument("--root_dataset",
                        type=str,
                        default='./data/Pascal_VOC')
    parser.add_argument("--resume", type=str, default='')
    parser.add_argument("--fcn",
                        type=str,
                        default='32s',
                        choices=['32s', '16s', '8s', '50', '101'])
    opts = parser.parse_args()

    # os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu_id)
    opts.cuda = get_cuda(torch.cuda.is_available() and opts.gpu_id != -1,
                         opts.gpu_id)
    print('Cuda', opts.cuda)
    cfg = get_config()[1]
    opts.cfg = cfg

    if opts.mode in ['train', 'trainval']:
        opts.out = get_log_dir('fcn' + opts.fcn, 1, cfg)
        print('Output logs: ', opts.out)

    data = get_loader(opts)

    trainer = Trainer(data, opts)
    if opts.mode == 'val':
        trainer.Test()
    elif opts.mode == 'demo':
        trainer.Demo()
    else:
        trainer.Train()
Example #16
parser = argparse.ArgumentParser()
parser.add_argument("experiment_name")
parser.add_argument("obj_id")
parser.add_argument("num_iterations")


arguments = parser.parse_args()

full_name = arguments.experiment_name.split('/')
obj_id = int(arguments.obj_id)
num_iterations = int(arguments.num_iterations)

experiment_name = full_name.pop()
experiment_group = full_name.pop() if len(full_name) > 0 else ''

log_dir = u.get_log_dir(workspace_path, experiment_name, experiment_group)
ckpt_dir = os.path.join(log_dir, 'checkpoints_lambda250')
checkpoint_file = u.get_checkpoint_basefilename(ckpt_dir)
print('log_dir', log_dir)
args = configparser.ConfigParser()
cfg_file_path = glob.glob(os.path.join(log_dir, '*.cfg'))[0]
args.read(cfg_file_path)

tf.reset_default_graph()

embedding_dim = 128
image_size = 128
ci = 4
path_embedding_data = './embedding92232s/{:02d}'.format(obj_id)  # path to the dir with the \bar{R} embedding info
embedding_size = 92232
normalize_images = True  # default False for the non-textured T-LESS CAD meshes, True for textured meshes such as LineMOD
Example #17
    parser.add_argument("--gpu_id", type=int, default=0)
    parser.add_argument("--backbone", type=str, default="vgg")
    parser.add_argument("--root_dataset", type=str, default="data/VOC/")
    parser.add_argument("--resume", type=str, default="")
    parser.add_argument("--fcn",
                        type=str,
                        default="32s",
                        choices=["32s", "16s", "8s", "50", "101"])
    opts = parser.parse_args()

    # os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu_id)
    opts.cuda = get_cuda(torch.cuda.is_available() and opts.gpu_id != -1,
                         opts.gpu_id)
    print("Cuda", opts.cuda)
    cfg = get_config()[1]
    opts.cfg = cfg

    if opts.mode in ["train", "trainval"]:
        opts.out = get_log_dir("fcn" + opts.fcn, 1, cfg)
        print("Output logs: ", opts.out)

    data = get_loader(opts)

    trainer = Trainer(data, opts)
    if opts.mode == "val":
        trainer.Test()
    elif opts.mode == "demo":
        trainer.Demo()
    else:
        trainer.Train()
Example #18
"""
Train model
1、First train simple loss
2、Second train weight loss
"""
fm_model = TorchFM(feature_dim=feat_dim, num_dim=NUM_DIM, init_mean=INIT_MEAN)
adam_opt = optim.Adam(fm_model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(adam_opt,
                                      step_size=DECAY_FREQ,
                                      gamma=DECAY_GAMME)
fm_learner = FMLearner(fm_model, adam_opt, scheduler, db)
fm_learner.compile(train_col='seq',
                   valid_col='seq',
                   test_col='seq',
                   loss_callback=callback_simple_loss)
fm_learner.fit(epoch=EPOCH, log_dir=get_log_dir('simple_topcoder', 'fm'))
del fm_model
T.cuda.empty_cache()

fm_model = TorchFM(feature_dim=feat_dim, num_dim=NUM_DIM, init_mean=INIT_MEAN)
adam_opt = optim.Adam(fm_model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(adam_opt,
                                      step_size=DECAY_FREQ,
                                      gamma=DECAY_GAMME)
fm_learner = FMLearner(fm_model, adam_opt, scheduler, db)
fm_learner.compile(train_col='seq',
                   valid_col='seq',
                   test_col='seq',
                   loss_callback=callback_simple_weight_loss)
fm_learner.fit(epoch=EPOCH, log_dir=get_log_dir('weight_topcoder', 'fm'))
del fm_model
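The two training phases differ only in the loss callback handed to compile(). The callbacks themselves are not part of this example; a minimal sketch of what they might look like, assuming a (pred, target[, weight]) signature, which the snippet does not confirm:

import torch
import torch.nn.functional as F

# hypothetical sketches -- the real callbacks and their signatures are not shown
def callback_simple_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # phase 1: plain regression loss
    return F.mse_loss(pred, target)

def callback_simple_weight_loss(pred: torch.Tensor, target: torch.Tensor,
                                weight: torch.Tensor) -> torch.Tensor:
    # phase 2: per-sample weighted squared error (weighting scheme assumed)
    return (weight * (pred - target) ** 2).mean()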
Example #19
Unauthorized copying, distribution, reproduction, publication, use of this file, via any medium is strictly prohibited.
Proprietary and confidential – June 2019
"""
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import numpy as np
from model import Model, save_freeze_tensorflow_model_for_inference, convert_to_tensor_rt, inference_from_tensor_rt_graph
from preprocessing import preprocess
from utils import init_configuration, download_data, get_log_dir, get_arguments_as_dict

# read the parameters from the config file
all_params = init_configuration(config_file='config/config.yaml')

# getting log directory to save the model and results
log_dir = get_log_dir(all_params)

print('downloading data')
train_path, test_path = download_data(reload=True)

print('preprocessing data')
dataset_train, dataset_test, dataset_train_lengths, dataset_test_lengths, dataset_test_for_predict, dataset_test_lengths_for_predict, x_test = preprocess(
    train_path, test_path, all_params)

print('initialize and train the model')
model = Model(log_dir, all_params)
model.train(dataset_train, dataset_test, dataset_train_lengths,
            dataset_test_lengths)

model.predict(dataset_test_for_predict, dataset_test_lengths_for_predict)

Example #20
if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-src", "--src_img_path", required=True, type=str)
    parser.add_argument("-style", "--style_img_path", required=True, type=str)
    parser.add_argument("-n", "--num_iters", required=True, type=int)
    parser.add_argument("-l", "--log_dir", required=True, type=str)
    parser.add_argument("-sc", "--style_coeff", default=0.5, type=float)
    parser.add_argument("-cc", "--content_coeff", default=0.5, type=float)

    args = parser.parse_args()

    run_dir = os.path.join(args.log_dir, "runs", "img_style_transfer",
                           utils.get_log_dir())
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    writer = SummaryWriter(run_dir, flush_secs=10)
    img_size = (512, 512)
    device = data_util.get_device()

    mean = torch.tensor([0.485, 0.456, 0.406],
                        device=device,
                        dtype=torch.float)[None, :, None, None]
    std = torch.tensor([0.229, 0.224, 0.225], device=device,
                       dtype=torch.float)[None, :, None, None]

    # normalize = transforms.Normalize(mean=mean, std=std)

Example #21
def main():
    # 0. input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--gpu', type=int, help='GPU device to use', default=0)
    parser.add_argument('-d', '--dataset', help='VOC, CamVid, Cityscapes, SUNRGBD, Custom', default='CamVid')
    parser.add_argument('-dr', '--datasetroot', help='dataset root pth', default='/home/hongkai/PycharmProjects/Datasets')
    parser.add_argument('-dt', '--degradedtrain', help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-dv', '--degradedval', help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-ds', '--degradedtest', help='o, bg, bm, hi, ho, ns, nsp', default='o')
    parser.add_argument('-c', '--config', type=int, default=1, choices=configurations.keys())
    parser.add_argument('-r', '--resume', help='Checkpoint path')
    args = parser.parse_args()

    gpu = args.gpu
    dataset = args.dataset
    dataset_root = args.datasetroot
    degradedtrain = args.degradedtrain
    degradedval = args.degradedval
    degradedtest = args.degradedtest
    cfg = configurations[args.config]
    out = utils.get_log_dir('fcn8s-atonce', args.config, cfg)
    resume = args.resume

    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)

    # 1. dataset
    root = osp.expanduser(osp.join(dataset_root, dataset))
    kwargs = {'num_workers': 4, 'pin_memory': True} if cuda else {}
    if dataset == 'VOC':
        train_data = datasets.VOCSeg(root, split='train', dataset=degradedtrain, transform=True)
        val_data = datasets.VOCSeg(root, split='val', dataset=degradedval, transform=True)
        test_data = datasets.VOCSeg(root, split='test', dataset=degradedtest, transform=True)
    elif dataset == "CamVid":
        train_data = datasets.CamVidSeg(root, split='train', dataset=degradedtrain, transform=True)
        val_data = datasets.CamVidSeg(root, split='val', dataset=degradedval, transform=True)
        test_data = datasets.CamVidSeg(root, split='test', dataset=degradedtest, transform=True)
    elif dataset == "Cityscapes":
        train_data = datasets.CityscapesSeg(root, split='train', dataset=degradedtrain, transform=True)
        val_data = datasets.CityscapesSeg(root, split='val', dataset=degradedval, transform=True)
        test_data = datasets.CityscapesSeg(root, split='test', dataset=degradedtest, transform=True)
    elif dataset == "Custom":
        train_data = datasets.CustomSeg(root, split='train', dataset=degradedtrain, transform=True)
        val_data = datasets.CustomSeg(root, split='val', dataset=degradedval, transform=True)
        test_data = datasets.CustomSeg(root, split='test', dataset=degradedtest, transform=True)
    else:
        train_data = datasets.SUNSeg(root, split='train', dataset=degradedtrain, transform=True)
        val_data = datasets.SUNSeg(root, split='val', dataset=degradedval, transform=True)
        test_data = datasets.SUNSeg(root, split='test', dataset=degradedtest, transform=True)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=1, shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False, **kwargs)

    # 2. model
    model = models.FCN8sAtOnce(n_class=train_data.n_classes)
    start_epoch = 0
    start_iteration = 0
    if resume:
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['model_state_dict'])
        start_epoch = checkpoint['epoch']
        start_iteration = checkpoint['iteration']
    else:
        vgg16 = torchfcn.models.VGG16(pretrained=True)
        model.copy_params_from_vgg16(vgg16)
    device = torch.device("cuda" if cuda else "cpu")
    model = model.to(device)

    # 3. optimizer
    optim = torch.optim.SGD(
        [
            {'params': utils.get_parameters(model, bias=False)},
            {'params': utils.get_parameters(model, bias=True),
             'lr': cfg['lr'] * 2, 'weight_decay': 0},
        ],
        lr=cfg['lr'],
        momentum=cfg['momentum'],
        weight_decay=cfg['weight_decay'])
    if resume:
        optim.load_state_dict(checkpoint['optim_state_dict'])

    # 4. trainer
    trainer = Trainer(
        cuda=cuda,
        model=model,
        optimizer=optim,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        out=out,
        max_iter=cfg['max_iteration'],
        interval_validate=cfg.get('interval_validate', len(train_loader)),
    )
    trainer.epoch = start_epoch
    trainer.iteration = start_iteration
    trainer.train()
Example #22
parser.add_argument("--no-mem", action="store_true", default=False,
                    help="don't use memory in the model")
args = parser.parse_args()

# Define model name

suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
default_model_name = "{}_{}_seed{}_{}".format(args.env, args.algo, args.seed, suffix)
model_name = args.model or default_model_name

# Define logger and Tensorboard writer and log script arguments

logger = utils.get_logger(model_name)
if args.tb:
    from tensorboardX import SummaryWriter
    writer = SummaryWriter(utils.get_log_dir(model_name))

logger.info("{}\n".format(args))

# Set seed for all randomness sources

utils.seed(args.seed)

# Generate environments

envs = []
for i in range(args.procs):
    env = gym.make(args.env)
    env.seed(args.seed + i)
    envs.append(env)