Example #1
def eval_joint_model(model, model_params):
    # get test data for evaluation
    data, metadata = gather('10k', 0)

    # build data format
    dformat = ['contexts', 'questions', 'answers']

    accuracies = []
    with tf.Session() as sess:

        # reload model params
        sess.run(set_op(model_params))

        # build trainer
        trainer = Trainer(sess, model, batch_size=128)

        for i in range(1, 21):
            # test feed for task 'i'
            testfeed = DataFeed(dformat, data=data['test'][i])

            # evaluate
            loss, acc = trainer.evaluate(feed=testfeed)

            # note down task accuracy
            accuracies.append(acc)

    print('\n:: Evaluation on individual tasks\n::   Accuracy')
    for i, acc in enumerate(accuracies):
        print(':: \t[{}] {}'.format(i, acc))
Example #2
def main():


    logger.info("Load  Config")
    data_and_support = CortexEpfl()
    cfg = config.load_config(data_and_support.name)

    logger.info("Initialize Experiment")
    trial_path, trial_id, log_msg = init(cfg)
    logger.info('Experiment ID: {}, Trial ID: {}, GPU: {}'.format(cfg.experiment_idx, trial_id, GPU_index))

    logger.info("Network config")
    model_config = NetworkConfig(cfg.step_count, cfg.first_layer_channels, cfg.num_classes, cfg.num_input_channel, True, cfg.ndims, 'same', trial_id, cfg.batch_size, cfg)

    logger.info("Create network")
    classifier = network(model_config)
    classifier.cuda()

    logger.info("Load data")
    cfg.patch_shape = model_config.in_out_shape(cfg.hint_patch_shape)

    data = data_and_support.load_data(cfg)
    loader = DataLoader(data[DataModes.TRAINING], batch_size=classifier.config.batch_size, shuffle=True)
    logger.info("Trainset length: {}".format(loader.__len__()))

    logger.info("Initialize optimizer")
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, classifier.parameters()), lr=cfg.learning_rate)

    logger.info("Initialize evaluator")
    evaluator = Evaluator(classifier, optimizer, data, trial_path, cfg, data_and_support, cfg.train_mode)

    logger.info("Initialize trainer")
    trainer = Trainer(classifier, loader, optimizer, cfg.numb_of_epochs, cfg.eval_every, trial_path, evaluator, log_msg)

    trainer.train()
Example #3
def main():
    CUDA_OK = torch.cuda.is_available()
    args = parse()
    dl = DataLoader()
    train_iter, valid_iter = dl.load_translation(
        data_path=args.data_path,
        exts=('.' + args.src, '.' + args.tgt),  # ('.zh', '.en')
        batch_size=args.batch_size,
        dl_save_path=args.dl_path)

    args.n_src_words, args.n_tgt_words = len(dl.SRC.vocab), len(dl.TGT.vocab)
    args.src_pdx, args.tgt_pdx = dl.src_padding_index, dl.tgt_padding_index
    print(args)

    model = build_model(args, cuda_ok=CUDA_OK)
    trainer = Trainer(args,
                      model=model,
                      optimizer=torch.optim.Adam(model.parameters(),
                                                 lr=1e-3,
                                                 betas=(0.9, 0.98),
                                                 eps=1e-9),
                      criterion=nn.CrossEntropyLoss(ignore_index=args.tgt_pdx,
                                                    reduction='mean'),
                      cuda_ok=CUDA_OK)
    trainer.train(train_iter,
                  valid_iter,
                  n_epochs=args.n_epochs,
                  save_path=args.ckpt_path)
Example #4
def main(model_name):
    # load the YAML config
    config = yaml.load(open("./config/" + str(model_name) + ".yaml", "r"), Loader=yaml.FullLoader)
    trainset = MyTrainSetWrapper(**config['train'])

    # initialize the Trainer class and run training
    downstream = Trainer(trainset, model_name, config)
    downstream.train()
Example #5
def main():
    manager = Manager.init()

    models = [["model", MobileNetV2(**manager.args.model)]]

    manager.init_model(models)
    args = manager.args
    criterion = Criterion()
    optimizer, scheduler = Optimizer(models, args.optim).init()

    args.cuda = args.cuda and torch.cuda.is_available()
    if args.cuda:
        for item in models:
            item[1].cuda()
        criterion.cuda()

    dataloader = DataLoader(args.dataloader, args.cuda)

    summary = manager.init_summary()
    trainer = Trainer(models, criterion, optimizer, scheduler, dataloader,
                      summary, args.cuda)

    for epoch in range(args.runtime.start_epoch,
                       args.runtime.num_epochs + args.runtime.start_epoch):
        try:
            print("epoch {}...".format(epoch))
            trainer.train(epoch)
            manager.save_checkpoint(models, epoch)

            if (epoch + 1) % args.runtime.test_every == 0:
                trainer.validate()
        except KeyboardInterrupt:
            print("Training had been Interrupted\n")
            break
    trainer.test()
Example #6
def main():

	args = args_parser()
	if args.task == 'train':
		# conll process
		data_vocab_class, processor_class, conll_config_path = dataset_name_to_class[args.dataset]
		conll_configs = config_loader(conll_config_path)
		if not os.path.exists(os.path.join(conll_configs['data_path'], 'train.txt')):
			data_vocab = data_vocab_class(conll_configs)
			conll_to_train_test_dev(conll_configs['label_file'], conll_configs['data_path'])
		
		# config
		configs = config_loader(args.config_path)
		configs['data_dir'] = os.path.join(configs['data_dir'], args.dataset.lower())
		configs['finetune_model_dir'] = os.path.join(configs['finetune_model_dir'], args.dataset.lower())
		configs['output_dir'] = os.path.join(configs['output_dir'], args.dataset.lower())
		check_dir(configs['data_dir'])
		check_dir(configs['finetune_model_dir'])
		check_dir(configs['output_dir'])
		
		# train
		processor = processor_class()
		for model_class in configs['model_class']:
			print('Begin Training %s Model on corpus %s' %(model_class, args.dataset))
			trainer = Trainer(configs, model_class, processor)
			trainer.train()

	if args.task == 'eval':
		data_vocab_class, processor_class, conll_config_path = dataset_name_to_class[args.dataset]
		conll_configs = config_loader(conll_config_path)
		if not os.path.exists(os.path.join(conll_configs['data_path'], 'test.txt')):
			data_vocab = data_vocab_class(conll_configs)
			conll_to_train_test_dev(conll_configs['label_file'], conll_configs['data_path'])

		configs = config_loader(args.config_path)
		configs['data_dir'] = os.path.join(configs['data_dir'], args.dataset.lower())
		configs['finetune_model_dir'] = os.path.join(configs['finetune_model_dir'], args.dataset.lower())
		configs['output_dir'] = os.path.join(configs['output_dir'], args.dataset.lower())
		check_dir(configs['data_dir'])
		check_dir(configs['finetune_model_dir'])
		check_dir(configs['output_dir'])

		processor = processor_class()
		for model_class in configs['model_class']:
			print('Begin Evaluate %s Model on corpus %s' %(model_class, args.dataset))
			predicter = Predictor(configs, model_class, processor)
			predicter.eval()
Example #7
def _train(config):
    VALIDATION_SIZE = 5000  # Size of the validation set.
    BATCH_SIZE = 64

    train_dataset = Dataset("mnist", 'train')
    test_dataset = Dataset("mnist", 'test')
    validation_dataset = train_dataset.split_validation(VALIDATION_SIZE)

    model = TutorialCNN(config)

    trainer = Trainer(config, model)
    trainer.set_trainer(optimizer='Momentum', training_size=train_dataset.size)

    # Create a local session to run the training.
    start_time = time.time()
    with tf.Session() as sess:
        # Run all the initializers to prepare the trainable parameters.
        tf.global_variables_initializer().run()
        print('Initialized!')

        for batch in train_dataset.get_batches(BATCH_SIZE,
                                               num_epoches=config.num_epochs):
            step = sess.run(trainer.global_step)
            feed_dict = model.get_feed_dict(batch,
                                            is_train=True,
                                            supervised=True)
            sess.run(trainer.train_op, feed_dict=feed_dict)
            if step % config.eval_period == 0:
                l, lr, acc = sess.run(
                    [model.loss, trainer.learning_rate, model.accuracy],
                    feed_dict=feed_dict)
                elapsed_time = time.time() - start_time
                start_time = time.time()
                print('Step %d, %.1f ms' %
                      (step, 1000 * elapsed_time / config.eval_period))
                print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
                print('Minibatch error: %.1f%%' % (100. - 100. * acc, ))
                print('Validation error: %.1f%%' % eval_error_in_dataset(
                    validation_dataset, model, sess, config))
                sys.stdout.flush()

        test_error = eval_error_in_dataset(test_dataset, model, sess, config)
        print('Test error: %.1f%%' % test_error)
Example #8
def run(model_cls, loss, predictor, acc_calc, train_dataset, valid_dataset,
        sps):

    # log setting
    alias = generate_alias(model_cls, task='Age')
    msg = generate_file_msg(sps, loss, predictor, acc_calc)
    tb_log_path = os.path.join('runs', alias)
    save_dir = os.path.join('models', alias)
    logger_alias = alias

    config = Config(epoch_num=sps['epoch_num'],
                    momentum=sps['momentum'],
                    weight_decay=sps['weight_decay'],
                    learning_rates=sps['learning_rates'],
                    decay_points=sps['decay_points'],
                    batch_size=sps['batch_size'],
                    parameters_func=parameters_func,
                    tb_log_path=tb_log_path,
                    save_dir=save_dir,
                    pretrain=sps['pretrain'],
                    pretrained_model_dir=sps['pretrained_model_dir'],
                    load_function=sps['load_function'],
                    logger_alias=logger_alias,
                    gpu_id=gpu_id)

    logger = Logger()
    logger.open_file(os.path.join('log'),
                     alias=alias,
                     file_name=alias + '.txt',
                     file_msg=msg)

    trainer = Trainer(model_cls=model_cls,
                      loss=loss,
                      predictor=predictor,
                      calculator=acc_calc,
                      train_dataset=train_dataset,
                      val_dataset=valid_dataset,
                      config=config,
                      logger=logger)

    trainer.train()
    logger.close_file(alias)
Example #9
def train(data, config):

    id2label=data['id2label']
    label_size=data['label_size']
    config['num_labels']=label_size

    model=Classifier(config)(num_labels=label_size)

    optimizer = Optim(config.optim, config.learning_rate, config.max_grad_norm,
                         lr_decay=config.learning_rate_decay, start_decay_at=config.start_decay_at)
    optimizer.set_parameters(model.parameters())

    if config.classifier in ('BertCNN', 'BertRCNN', 'BertDPCNN', 'BertFC'):
        trainer = Trainer(config,
            model=model,
            logger=logger,
            criterion=BCEWithLogLoss(),
            optimizer=optimizer,
            early_stopping=None,
            epoch_metrics=[AUC(average='micro', task_type='binary'),MultiLabelReport(id2label=id2label),F1Score(average='micro')])
    elif config.classifier in ('BertSGM', 'SGM'):
        criterion = nn.CrossEntropyLoss(ignore_index=dict_helper.PAD, reduction='none')
        if config.n_gpu!='':
            criterion.cuda()
        trainer = Trainer(config,
                          model=model,
                          logger=logger,
                          criterion=criterion,
                          optimizer=optimizer,
                          early_stopping=None,
                          epoch_metrics=[AUC(average='micro', task_type='binary'),
                                         F1Score(average='micro')])
    elif config.classifier=='BertSeq2Set':
         trainer = Trainer(config,
                          model=model,
                          logger=logger,
                          criterion=None,
                          optimizer=optimizer,
                          early_stopping=None,
                          epoch_metrics=[AUC(average='micro', task_type='binary'),F1Score(average='micro')])
                          
    trainer.train(data=data,seed=config.seed)
Example #10
  def run(self):
    device = "/cpu:0"
    if USE_GPU:
      device = "/gpu:0"

    self.print_flags_info()

    if flags.segnet == -1:
      with open(flags.segnet_config) as f:
        self.config = json.load(f)

      self.num_classes = self.config["NUM_CLASSES"]
      self.use_vgg = self.config["USE_VGG"]

      if self.use_vgg is False:
        self.vgg_param_dict = None
        print("No VGG path in config, so learning from scratch")
      else:
        self.vgg16_npy_path = self.config["VGG_FILE"]
        self.vgg_param_dict = np.load(self.vgg16_npy_path, encoding='latin1').item()
        print("VGG parameter loaded")

      self.bayes = self.config["BAYES"]
      segnet_param_dict = {'segnet_mode': flags.segnet, 'vgg_param_dict': self.vgg_param_dict, 'use_vgg': self.use_vgg,
                       'num_classes': self.num_classes, 'bayes': self.bayes}
    else: # 0, 1, 2, 3
      segnet_param_dict = {'segnet_mode': flags.segnet}

    if flags.env_type != 'indoor':
        env_config = {}
    else:
        env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config.get('height', 84), env_config.get('width', 84)]
    self.map_file = env_config.get('objecttypes_file', '../../objectTypes_1x.csv')
    
    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)
    self.global_t = 0
    
    self.stop_requested = False
    self.terminate_requested = False
    
    action_size = Environment.get_action_size(flags.env_type,
                                              flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    is_training = tf.placeholder(tf.bool, name="training")

    self.random_state = np.random.RandomState(seed=env_config.get("seed", 0xA3C))

    print("Global network initializing!")#, flush=True)

    self.global_network = UnrealModel(action_size,
                                      objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta,
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout)
    self.trainers = []
    
    learning_rate_input = tf.placeholder("float")

    
    grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                  decay = flags.rmsp_alpha,
                                  momentum = 0.0,
                                  epsilon = flags.rmsp_epsilon,
                                  clip_norm = flags.grad_norm_clip,
                                  device = device)
    
    for i in range(flags.parallel_size):
      trainer = Trainer(i,
                        self.global_network,
                        initial_learning_rate,
                        learning_rate_input,
                        grad_applier,
                        flags.env_type,
                        flags.env_name,
                        flags.use_lstm,
                        flags.use_pixel_change,
                        flags.use_value_replay,
                        flags.use_reward_prediction,
                        flags.pixel_change_lambda,
                        flags.entropy_beta,
                        flags.local_t_max,
                        flags.n_step_TD,
                        flags.gamma,
                        flags.gamma_pc,
                        flags.experience_history_size,
                        flags.max_time_step,
                        device,
                        segnet_param_dict=segnet_param_dict,
                        image_shape=self.image_shape,
                        is_training=is_training,
                        n_classes = flags.n_classes,
                        random_state=self.random_state,
                        termination_time=flags.termination_time_sec,
                        segnet_lambda=flags.segnet_lambda,
                        dropout=flags.dropout)
      self.trainers.append(trainer)


    self.last_scores = []
    self.best_score = -1.0
    
    # prepare session
    config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    # Wrap sess.run for debugging messages!
    def run_(*args, **kwargs):
      #print(">>> RUN!", args[0] if args else None)#, flush=True)
      return self.sess.__run(*args, **kwargs)  # getattr(self, "__run")(self, *args, **kwargs)
    self.sess.__run, self.sess.run = self.sess.run, run_

    self.sess.run(tf.global_variables_initializer())

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.float32)
    self.sr_input = tf.placeholder(tf.float32)
    self.mIoU_input = tf.placeholder(tf.float32)
    self.term_global_t = tf.placeholder(tf.int32)

    self.losses_input = {}

    self.total_loss = tf.placeholder(tf.float32,  name='total_loss')
    self.losses_input.update({'all/total_loss': self.total_loss})

    self.base_loss = tf.placeholder(tf.float32, name='base_loss')
    self.losses_input.update({'all/base_loss': self.base_loss})

    self.policy_loss = tf.placeholder(tf.float32,  name='policy_loss')
    self.losses_input.update({'all/policy_loss': self.policy_loss})

    self.value_loss = tf.placeholder(tf.float32, name='value_loss')
    self.losses_input.update({'all/value_loss': self.value_loss})

    self.grad_norm = tf.placeholder(tf.float32, name='grad_norm')
    self.losses_input.update({'all/loss/grad_norm': self.grad_norm})

    self.entropy_input = tf.placeholder(tf.float32, shape=[None], name='entropy')

    if segnet_param_dict["segnet_mode"] >= 2:
      self.decoder_loss = tf.placeholder(tf.float32,  name='decoder_loss')
      self.losses_input.update({'all/decoder_loss': self.decoder_loss})
      self.l2_weights_loss = tf.placeholder(tf.float32, name='regul_weights_loss')
      self.losses_input.update({'all/l2_weights_loss': self.l2_weights_loss})
    if flags.use_pixel_change:
      self.pc_loss = tf.placeholder(tf.float32,  name='pc_loss')
      self.losses_input.update({'all/pc_loss': self.pc_loss})
    if flags.use_value_replay:
      self.vr_loss = tf.placeholder(tf.float32,  name='vr_loss')
      self.losses_input.update({'all/vr_loss': self.vr_loss})
    if flags.use_reward_prediction:
      self.rp_loss = tf.placeholder(tf.float32,  name='rp_loss')
      self.losses_input.update({'all/rp_loss': self.rp_loss})

    score_summary = tf.summary.scalar("all/eval/score", self.score_input)
    sr_summary = tf.summary.scalar("all/eval/success_rate", self.sr_input)
    term_summary = tf.summary.scalar("all/eval/term_global_t", self.term_global_t)
    eval_summary = tf.summary.scalar("all/eval/mIoU_all", self.mIoU_input)
    losses_summary_list = []
    for key, val in self.losses_input.items():
      losses_summary_list += [tf.summary.scalar(key, val)]


    self.summary_op_dict = {'score_input': score_summary, 'eval_input': eval_summary, 'sr_input':sr_summary,
                            'losses_input': tf.summary.merge(losses_summary_list),
                            'entropy': tf.summary.scalar('all/eval/entropy_stepTD', tf.reduce_mean(self.entropy_input)),
                            'term_global_t': term_summary}
    flags.checkpoint_dir = os.path.join(base_dir, flags.checkpoint_dir)
    #print("First dirs {}::{}".format(flags.log_dir, flags.checkpoint_dir))
    print("Checkpoint dir: {}, Log dir: {}".format(flags.checkpoint_dir, flags.log_dir))
    overall_FW = tf.summary.FileWriter(os.path.join(flags.log_dir, 'overall'),
                          self.sess.graph)
    self.summary_writer = [(tf.summary.FileWriter(os.path.join(flags.log_dir, 'worker_{}'.format(i)),
                                                self.sess.graph), overall_FW) for i in range(flags.parallel_size)]
    
    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_global_vars(), max_to_keep=20)


    
    #checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir, latest_filename ="best-checkpoint")
    #if checkpoint is None or checkpoint.model_checkpoint_path is None:
    #  checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)

    if checkpoint and checkpoint.model_checkpoint_path:
      if flags.segnet == -1:
          from tensorflow.python import pywrap_tensorflow
          reader = pywrap_tensorflow.NewCheckpointReader(checkpoint.model_checkpoint_path)
          big_var_to_shape_map = reader.get_variable_to_shape_map()
          s = []
          for key in big_var_to_shape_map:
              s += [key]
              # print("tensor_name: ", key)
          glob_var_names = [v.name for v in tf.global_variables()]
          endings = [r.split('/')[-1][:-2] for r in glob_var_names]
          old_ckpt_to_new_ckpt = {[k for k in s if endings[i] in k][0]: v for i, v in enumerate(tf.global_variables())}
          saver1 = tf.train.Saver(var_list=old_ckpt_to_new_ckpt)
          saver1.restore(self.sess, checkpoint.model_checkpoint_path)
      else:
          self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
      print("checkpoint loaded:", checkpoint.model_checkpoint_path)
      tokens = checkpoint.model_checkpoint_path.split("-")
      # set global step
      if 'best' in checkpoint.model_checkpoint_path:
          files = os.listdir(flags.checkpoint_dir)
          max_g_step = 0
          max_best_score = -10
          for file in files:
            if '.meta' not in file or 'checkpoint' not in file:
              continue
            if len(tokens) == 2:
              continue
            if len(tokens) > 3:
              best_score = float('-0.'+file.split('-')[2]) if 'best' in file else float('-0.'+file.split('-')[1])
              if best_score > max_best_score:
                g_step = int(file.split('-')[3].split('.')[0]) if 'best' in file else int(file.split('-')[2].split('.')[0])
                if max_g_step < g_step:
                  max_g_step = g_step
            else:
              self.best_score = -1.0
              g_step = int(file.split('-')[2]) if 'best' in file else int(file.split('-')[1])
              if max_g_step < g_step:
                max_g_step = g_step
          self.best_score = max_best_score
          self.global_t = max_g_step
          print("Chosen g_step >>", g_step)
      else:
        if len(tokens) == 3:
          self.global_t = int(tokens[2])
        else:
          self.global_t = int(tokens[1])
      #for i in range(flags.parallel_size):
      #  self.trainers[i].local_t = self.global_t
      print(">>> global step set: ", self.global_t)
      # set wall time
      wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
      with open(wall_t_fname, 'r') as f:
        self.wall_t = float(f.read())
        self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step
      print_tensors_in_checkpoint_file(file_name=checkpoint.model_checkpoint_path,
                                     tensor_name='', all_tensors=False, all_tensor_names=True)
    else:
      print("Could not find old checkpoint")
      # set wall time
      self.wall_t = 0.0
      self.next_save_steps = flags.save_interval_step
    print("Global step {}, max best score {}".format(self.global_t, self.best_score))

    if flags.segnet_pretrain:
      checkpoint_dir = "../erfnet_segmentation/models"
      checkpoint_dir = os.path.join(checkpoint_dir, "aug_erfnetC_0_{}x{}_{}x/snapshots_best".format(
          self.image_shape[1],
          self.image_shape[0],
          self.map_file.split('_')[1].split('x')[0]))
      checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)

      big_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_encoder')
      big_weights += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_decoder')

      erfnet_weights = [l.name.split(':')[0].rsplit('net_-1/base_encoder/')[-1] for l in big_weights
             if len(l.name.split(':')[0].rsplit('net_-1/base_encoder/')) == 2]
      erfnet_weights += [l.name.split(':')[0].rsplit('net_-1/base_decoder/')[-1] for l in big_weights
              if len(l.name.split(':')[0].rsplit('net_-1/base_decoder/')) == 2]

      if checkpoint and checkpoint.model_checkpoint_path:
          saver2 = tf.train.Saver(var_list=dict(zip(erfnet_weights, big_weights)))
          saver2.restore(self.sess, checkpoint.model_checkpoint_path)
          print("ErfNet pretrained weights restored from file ", checkpoint_dir)
      else:
          print("Can't load pretrained weights for ErfNet from file ", checkpoint_dir)

    # run training threads
    self.train_threads = []
    for i in range(flags.parallel_size):
      self.train_threads.append(threading.Thread(target=self.train_function, args=(i,True)))
      
    signal.signal(signal.SIGINT, self.signal_handler)
  
    # set start time
    self.start_time = time.time() - self.wall_t

    print("Ready to start")
    for t in self.train_threads:
      t.start()
  
    print('Press Ctrl+C to stop')#, flush=True)
    signal.pause()
Example #11
def main(_):
    options = get_options_from_flags()
    maybe_download_data_files_from_s3(options)
    Trainer(options).train()
Example #12
from utils.dataloader import DataLoader
import torch
from model import model_utils
from optimizer.optimizer import NoamOpt
from train.trainer import Trainer

hidden_size = 256
num_encoder = 6
num_decoder = 6
n_head = 8
pf_dim = 1024
drop_out = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

dataloader = DataLoader(device)
train_iterator, valid_iterator, test_iterator = dataloader.load_data(64)
model = model_utils.create_model(dataloader.src_vocab_size(), dataloader.trg_vocab_size(), hidden_size, num_encoder, num_decoder, n_head, pf_dim,
                                 drop_out, dataloader.get_pad_idx(), device)

print(model_utils.count_parameters(model))
model_utils.init(model)
optimizer = NoamOpt(hidden_size , 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

trainer = Trainer(train_iterator, valid_iterator, model, optimizer, dataloader.get_pad_idx(), device)
trainer.train(5)
# for i, batch in enumerate(train_iterator):
#     src = batch.src.permute(1, 0).to(device)
#     trg = batch.trg.permute(1, 0).to(device)
Example #13
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)

        self.global_t = 0

        self.stop_requested = False
        self.terminate_reqested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        for i in range(
                flags.parallel_size):  #Trainer creates a UnrealModel in init
            trainer = Trainer(i, self.global_network, initial_learning_rate,
                              learning_rate_input, grad_applier,
                              flags.env_type, flags.env_name,
                              flags.use_pixel_change, flags.use_value_replay,
                              flags.use_reward_prediction,
                              flags.pixel_change_lambda, flags.entropy_beta,
                              flags.local_t_max, flags.gamma, flags.gamma_pc,
                              flags.experience_history_size,
                              flags.max_time_step, device)
            self.trainers.append(trainer)

        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)

        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step

        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads  ## Each Env is Running Here Parallel
        self.train_threads = []
        for i in range(flags.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train_function, args=(i, True)))

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t

        for t in self.train_threads:
            t.start()

        print('Press Ctrl+C to stop')
        signal.pause()
Example #14
def setup_trainer(config,
                  name,
                  datasets,
                  monitor="val",
                  pretrained=None,
                  finetune=False,
                  label_transformer=None,
                  disable_cache=False):
    """
		Prepare everything needed for a train + validation typical pipeline.

	Args:
		config:	dict, experiment parameters
		name:	str, name of the experiment
		datasets: dict, data for every data partition (X, y)
		monitor: str, partition to watch on learning time 
		pretrained: str, path to pretrained model and conf files
		finetune: bool, whether to finetune pretrained model
		label_transformer: Label transform function
		disable_cache: Whether to activate caching (TODO)

	Return:
		a Trainer object
	"""
    pretrained_model = None
    pretrained_config = None
    if pretrained:
        pretrained_model, pretrained_config = get_pretrained(pretrained)

    word2idx = None
    embeddings = None
    if config["embeddings_file"]:
        word2idx, idx2word, embeddings = load_embeddings(config)

    preprocessor = config["preprocessor"]
    try:
        preprocessor = getattr(preproc, preprocessor)
    except TypeError:
        preprocessor = preproc.dummy_preprocess

    loaders = get_dataloaders(datasets,
                              batch_size=config["batch_size"],
                              data_type=config["data_type"],
                              name=name,
                              preprocessor=preprocessor(),
                              label_transformer=label_transformer,
                              word2idx=word2idx,
                              config=config)

    output_size, task = get_output_size(loaders["train"].dataset.labels)
    weights = None
    if task != "regression":
        weights = class_weights(loaders["train"].dataset.labels,
                                to_pytorch=True).to(DEVICE)

    if embeddings is None:
        model = MonoModalModel(
            out_size=output_size,
            embeddings=embeddings,
            embed_dim=config["embeddings_size"],
            pretrained=pretrained_model,
            finetune=finetune,
            encoder_params=config["encoder_params"],
            attention_params=config["attention_params"]).to(DEVICE)
    else:
        model = MonoModalModel(
            out_size=output_size,
            embeddings=embeddings,
            pretrained=pretrained_model,
            finetune=finetune,
            embed_params=config["embeddings_params"],
            encoder_params=config["encoder_params"],
            attention_params=config["attention_params"]).to(DEVICE)

    criterion = get_criterion(task, weights)
    parameters = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = get_optimizer(parameters,
                              lr=config["lr"],
                              weight_decay=config["weight_decay"])

    pipeline = get_pipeline(task, criterion)

    metrics, monitor_metric, mode = get_metrics(task)

    model_dir = None
    if pretrained:
        model_dir = os.path.join(TRAINED_PATH, "TL")

    checkpoint = Checkpoint(name=name,
                            model=model,
                            model_conf=config,
                            monitor=monitor,
                            keep_best=True,
                            timestamp=True,
                            scorestamp=True,
                            metric=monitor_metric,
                            mode=mode,
                            base=config["base"],
                            model_dir=model_dir)

    early_stopping = EarlyStop(metric=monitor_metric,
                               mode=mode,
                               monitor=monitor,
                               patience=config["patience"],
                               min_change=config["min_change"])

    return Trainer(model=model,
                   loaders=loaders,
                   task=task,
                   config=config,
                   optimizer=optimizer,
                   pipeline=pipeline,
                   metrics=metrics,
                   checkpoint=checkpoint,
                   early_stopping=early_stopping)
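
The pipeline above passes only parameters with requires_grad=True to the optimizer (see the filter(...) call before get_optimizer). A minimal standalone sketch of that pattern, using a placeholder model that is not part of the example:

from torch import nn, optim

# placeholder model: freeze the embedding layer, fine-tune the rest
model = nn.Sequential(nn.Embedding(1000, 50), nn.Linear(50, 3))
for p in model[0].parameters():
    p.requires_grad = False

# hand only the trainable parameters to the optimizer, as setup_trainer does
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(parameters, lr=1e-3, weight_decay=1e-5)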
Example #15
from __future__ import print_function, division
from skimage import transform
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torchvision import transforms
from dataset.screendataset import ScreenDataset as ScreenDataset
from dataset.transforms import Rescale
from dataset.transforms import ToTensor
from train.trainer import Trainer


def show_landmarks(screendataset):
    plt.imshow(screendataset.__getitem__(0)['image'])


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--load', type=bool, default=False)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--max', type=int, default=100000)
    parser.add_argument('--epochs', type=int, default=5)
    args = parser.parse_args()
    print("load=", args.load, "lr=", args.lr, "max=", args.max, "epochs=",
          args.epochs)
    trainer = Trainer(args.load, args.lr, args.epochs, args.max)
    trainer.train_model()
Example #16
                  n=1,
                  optimizer=tf.train.AdamOptimizer,
                  lr=0.001,
                  vocab_size=metadata['vocab_size'],
                  max_candidates=metadata['max_candidates'],
                  demb=384,
                  dhdim=384,
                  num_layers=1)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:

        # init session
        sess.run(tf.global_variables_initializer())

        # create trainer
        trainer = Trainer(sess,
                          model,
                          trainfeed,
                          testfeed,
                          batch_size=batch_size)

        # train
        acc = trainer.fit(epochs=1000000,
                          eval_interval=1,
                          mode=Trainer.TRAIN,
                          verbose=True,
                          lr=0.001)

        print(':: \tAccuracy after training: ', acc)
"""Run script
Author: Alaaeldin Ali"""

from train.trainer import Trainer
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lr', default=2.5e-4)
parser.add_argument('--vis_screen', default='Relnet')
parser.add_argument('--save_path', default=None)
parser.add_argument('-warmup', action='store_true')
parser.add_argument('--batch_size', default=64)
args = parser.parse_args()
trainer = Trainer(lr=args.lr,
                  screen=args.vis_screen,
                  batch_size=args.batch_size,
                  save_path=args.save_path,
                  warmup=args.warmup)
trainer.train()
Example #18
    # make 'n' copies of model for data parallelism
    make_parallel(model, num_copies=4, num_gpus=4)

    # setup visualizer
    #  by default, writes to ./log/
    vis = Visualizer(interval=50)
    vis.attach_scalars(model)
    vis.attach_params() # histograms of trainable variables


    # create data source (SQuAD)
    datasrc = DataSource(batch_size, 
            glove_file='../../../datasets/glove/glove.6B.300d.txt', 
            random_x=0.2)

    # gpu config
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # init session
        sess.run(tf.global_variables_initializer())

        vis.attach_graph(sess.graph)

        # init trainer
        trainer = Trainer(sess, model, datasrc, batch_size, rand=True)

        # fit model
        trainer.fit(epochs=1000, visualizer=vis)
Example #19
def train(args):
    ########### data ###########
    processor = Postprocessor(
        config["postprocessor"])(do_lower_case=args.do_lower_case)
    label_list = processor.get_labels(config['data_dir'] / "labels.txt")
    id2label = {i: label for i, label in enumerate(label_list)}

    train_data = processor.get_train(config['data_dir'] /
                                     "{}.train.pkl".format(args.data_name))
    train_examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config["data_dir"] /
        "cached_train_examples_{}".format(args.pretrain))
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config["data_dir"] /
        "cached_train_features_{}_{}".format(args.train_max_seq_len,
                                             args.pretrain))

    train_dataset = processor.create_dataset(train_features,
                                             is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_data = processor.get_dev(config["data_dir"] /
                                   "{}.valid.pkl".format(args.data_name))
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type='valid',
        cached_examples_file=config["data_dir"] /
        "cached_valid_examples_{}".format(args.pretrain))
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config["data_dir"] /
        "cached_valid_features_{}_{}".format(args.eval_max_seq_len,
                                             args.pretrain))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    if config["pretrain"] == "Nopretrain":
        config["vocab_size"] = processor.vocab_size

    ########### model ###########
    logger.info("=========  initializing model =========")
    if args.resume_path:
        resume_path = Path(args.resume_path)
        model = Classifier(config["classifier"], config["pretrain"],
                           resume_path)(num_labels=len(label_list))
    else:
        model = Classifier(config["classifier"], config["pretrain"],
                           "")(num_labels=len(label_list))

    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer,
                                        warmup_steps=warmup_steps,
                                        t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as e:
            raise ImportError(
                "Please install apex github.com/nvidia/apex to use fp16.")
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    ########### callback ###########
    logger.info("========= initializing callbacks =========")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'],
                                    arch=args.pretrain)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.pretrain,
                                       save_best_only=args.save_best)

    ########### train ###########
    logger.info("========= Running training =========")
    logger.info("  Num examples = {}".format(len(train_examples)))
    logger.info("  Num Epochs = {}".format(args.epochs))
    logger.info("  Total train batch size \
        (w. parallel, distributed & accumulation) = {}".format(
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
    logger.info("  Gradient Accumulation steps = {}".format(
        args.gradient_accumulation_steps))
    logger.info("  Total optimization steps = {}".format(t_total))

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        epochs=args.epochs,
        logger=logger,
        criterion=BCEWithLogLoss(),
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        early_stopping=None,
        training_monitor=train_monitor,
        fp16=args.fp16,
        resume_path=args.resume_path,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        epoch_metrics=[
            AUC(average='micro', task_type='binary'),
            MultiLabelReport(id2label=id2label)
        ])
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  seed=args.seed)
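
The no_decay grouping above excludes biases and LayerNorm weights from weight decay before building AdamW. A minimal runnable sketch of that grouping, with a toy module standing in for the BERT classifier and torch.optim.AdamW standing in for the AdamW used in the example:

from torch import nn
from torch.optim import AdamW

class TinyHead(nn.Module):
    # toy stand-in; the attribute name 'LayerNorm' makes the filter below match
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(10, 10)
        self.LayerNorm = nn.LayerNorm(10)
        self.out = nn.Linear(10, 2)

model = TinyHead()
no_decay = ['bias', 'LayerNorm.weight']
grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(grouped_parameters, lr=2e-5, eps=1e-8)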
Example #20
    print(X_train.shape)
    y_train = to_one_hot(y_train)
    test_size = 10000
    #test_mask = np.random.choice(X_train.shape[0], test_size)

    X_test = X_train[0:test_size]
    y_test = y_train[0:test_size]
    X_train = X_train[test_size:]
    y_train = y_train[test_size:]

    network = DeepConvNet()
    trainer = Trainer(network,
                      X_train,
                      y_train,
                      X_test,
                      y_test,
                      epochs=100,
                      mini_batch_size=100,
                      optimizer='adagrad',
                      optimizer_param={'lr': 0.01},
                      evaluate_sample_num_per_epoch=1000)
    trainer.train()

    y_pred = network.predict(X_pred)
    print(y_pred.shape)
    y_pred = one_hot_to(y_pred)
    submission = pd.DataFrame(y_pred, columns=["label"])

    submission.index += 1
    submission.to_csv("submission.csv", index_label='id')
Example #21
from config import cifar_configs
from train.trainer import Trainer
from data.data_handler import DataHandler

if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--config-file", default = "./config/train_config.yaml", metavar = "FILE", type = str)
    # args = parser.parse_args()

    # #extract config
    # config_file = open(args.config_file, 'r')
    # configs = yaml.load(config_file)
    datahandler = DataHandler(cifar_configs)
    trainer = Trainer(cifar_configs, datahandler)
    trainer.train()
Example #22
    monitor="val_loss",
    save_best_only=False,
    best_model_name=BEST_MODEL_NAME,
    epoch_model_name=EPOCH_MODEL_NAME,
    arch=ARCH,
    logger=logger,
)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    device=device,
    n_gpus=len(MULTI_GPUS),
    criterion=TripletLoss_op(),
    fts_flag=False,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    model_checkpoint=model_checkpoint,
    logger=logger,
    resume=RESUME,
)

trainer.summary()
trainer.train()

logger.info(f'Took {time_to_str((timer() - start_time), "sec")}')

print("---------- Bert Eval ... ----------")
start_time = timer()
Example #23
def train_separate_all(dataset='1k'):

    batch_size = 64

    task_max_acc = []
    for task in range(1, 21):
        # get task 1
        #task = 18
        data, metadata = gather('1k', task)

        # gather info from metadata
        num_candidates = metadata['candidates']['vocab_size']
        vocab_size = metadata['vocab_size']
        memsize = metadata['clen']
        sentence_size = metadata['slen']
        qlen = metadata['qlen']

        print(':: <task {}> memory size : {}'.format(task, memsize))

        # build data format
        dformat = ['contexts', 'questions', 'answers']

        # create feeds
        trainfeed = DataFeed(dformat, data=data['train'])
        testfeed = DataFeed(dformat, data=data['test'])

        # instantiate model
        model = MemoryNet(hdim=20,
                          num_hops=3,
                          memsize=memsize,
                          sentence_size=sentence_size,
                          qlen=qlen,
                          vocab_size=vocab_size,
                          num_candidates=num_candidates)

        with tf.Session() as sess:
            # run for multiple initializations
            i, accuracy = 0, [0.]
            while accuracy[-1] < 0.95 and i < 5:
                # init session
                sess.run(tf.global_variables_initializer())

                # create trainer
                trainer = Trainer(sess,
                                  model,
                                  trainfeed,
                                  testfeed,
                                  batch_size=batch_size)

                print('\n:: <task {}> ({}) [1/2] Pretraining'.format(task, i))
                # pretrain
                acc = trainer.fit(epochs=100000,
                                  eval_interval=1,
                                  mode=Trainer.PRETRAIN,
                                  verbose=False,
                                  batch_size=64,
                                  lr=0.0005)
                print(':: \tAccuracy after pretraining: ', acc)

                print('\n:: <task {}> ({}) [2/2] Training'.format(task, i))
                # train
                acc = trainer.fit(epochs=1000000,
                                  eval_interval=10,
                                  mode=Trainer.TRAIN,
                                  verbose=False,
                                  batch_size=64,
                                  lr=0.0005)
                print(':: \tAccuracy after training: ', acc)

                # next iteration
                i += 1
                # add accuracy to list
                accuracy.append(acc)
                print(acc)

            print('Experiment Results : ')
            for i, a in enumerate(accuracy[1:]):
                print(i, a)

        task_max_acc.append(max(accuracy))

    print('____________________________________________')
    for i, acc in enumerate(task_max_acc):
        print('Task ({}) : {}'.format(i + 1, acc))
    print('____________________________________________')
Example #24
import argparse

from train.trainer import Trainer
from utils.base_utils import load_cfg

parser = argparse.ArgumentParser()
parser.add_argument('--cfg',
                    type=str,
                    default='configs/lmcnet/lmcnet_sift_outdoor_test.yaml')
flags = parser.parse_args()

trainer = Trainer(load_cfg(flags.cfg))
trainer.run()
Example #25
def train_separate(task, dataset='1k', iterations=1, batch_size=32):

    # get data for task
    data, metadata = gather(dataset, task)

    # build data format
    dformat = ['contexts', 'questions', 'answers']

    # create feeds
    trainfeed = DataFeed(dformat, data=data['train'])
    testfeed = DataFeed(dformat, data=data['test'])

    hdim = 20 if task else 50
    eval_interval = 100 if task else 10
    batch_size = 32 if task else 128

    # instantiate model
    model = MemoryNet(hdim=hdim,
                      num_hops=3,
                      memsize=metadata['clen'],
                      sentence_size=metadata['slen'],
                      qlen=metadata['qlen'],
                      vocab_size=metadata['vocab_size'],
                      num_candidates=metadata['candidates']['vocab_size'])

    # info
    print(':: <task {}> [0/2] Info'.format(task))
    print(':: \t memory size : {}, #candidates : {}'.format(
        metadata['clen'], metadata['candidates']['vocab_size']))

    with tf.Session() as sess:
        # run for multiple initializations
        i, accuracy, model_params = 0, [0.], [None]
        while accuracy[-1] < 0.95 and i < iterations:
            # init session
            sess.run(tf.global_variables_initializer())

            # create trainer
            trainer = Trainer(sess,
                              model,
                              trainfeed,
                              testfeed,
                              batch_size=batch_size)

            print('\n:: <task {}> ({}) [1/2] Pretraining'.format(task, i))
            # pretrain
            acc = trainer.fit(epochs=100000,
                              eval_interval=1,
                              mode=Trainer.PRETRAIN,
                              verbose=False,
                              lr=0.0005)

            print(':: \tAccuracy after pretraining: ', acc)

            print('\n:: <task {}> ({}) [2/2] Training'.format(task, i))
            # train
            acc = trainer.fit(epochs=1000000,
                              eval_interval=eval_interval,
                              mode=Trainer.TRAIN,
                              verbose=False,
                              lr=0.0005)

            print(':: \tAccuracy after training: ', acc)

            # next iteration
            i += 1
            # add accuracy to list
            accuracy.append(acc)
            model_params.append(sess.run(tf.trainable_variables()))
            print(acc)

        print(':: [x/x] End of training')
        print(':: Max accuracy :', max(accuracy))

        # return model and best model params
        return model, model_params[accuracy.index(max(accuracy))]
Example #26
    def __init__(self,
                 ms_config_p, dl_config_p,
                 log_dir_root, log_config: LogConfig,
                 num_workers,
                 saver: Saver, restorer: TrainRestorer=None,
                 sw_cls=vis.safe_summary_writer.SafeSummaryWriter):
        """
        :param ms_config_p: Path to the multiscale config file, see README
        :param dl_config_p: Path to the dataloader config file, see README
        :param log_dir_root: All outputs (checkpoints, tensorboard) will be saved here.
        :param log_config: Instance of train.trainer.LogConfig, contains intervals.
        :param num_workers: Number of workers to use for DataLoading, see train.py
        :param saver: Saver instance to use.
        :param restorer: Instance of TrainRestorer, if we need to restore
        """

        # Read configs
        # config_ms = config for the network (ms = multiscale)
        # config_dl = config for data loading
        (self.config_ms, self.config_dl), rel_paths = ft.unzip(map(config_parser.parse, [ms_config_p, dl_config_p]))
        # Update config_ms depending on global_config
        global_config.update_config(self.config_ms)
        # Create data loaders
        dl_train, dl_val = self._get_dataloaders(num_workers)
        # Create blueprint. A blueprint collects the network as well as the losses in one class, for easy reuse
        # during testing.
        self.blueprint = MultiscaleBlueprint(self.config_ms)
        print('Network:', self.blueprint.net)
        # Setup optimizer
        optim_cls = {'RMSprop': optim.RMSprop,
                     'Adam': optim.Adam,
                     'SGD': optim.SGD,
                     }[self.config_ms.optim]
        net = self.blueprint.net
        self.optim = optim_cls(net.parameters(), self.config_ms.lr.initial,
                               weight_decay=self.config_ms.weight_decay)
        # Calculate a rough estimate for time per batch (does not take into account that CUDA is async,
        # but good enough to get a feeling during training).
        self.time_accumulator = timer.TimeAccumulator()
        # Restore network if requested
        skip_to_itr = self.maybe_restore(restorer)
        if skip_to_itr is not None:  # i.e., we have a restorer
            print('Skipping to {}...'.format(skip_to_itr))
        # Create LR schedule to update parameters
        self.lr_schedule = lr_schedule.from_spec(
                self.config_ms.lr.schedule, self.config_ms.lr.initial, [self.optim], epoch_len=len(dl_train))

        # --- All nn.Modules are setup ---
        print('-' * 80)

        # create log dir and summary writer
        self.log_dir = Trainer.get_log_dir(log_dir_root, rel_paths, restorer)
        self.log_date = logdir_helpers.log_date_from_log_dir(self.log_dir)
        self.ckpt_dir = os.path.join(self.log_dir, CKPTS_DIR_NAME)
        print(f'Checkpoints will be saved to {self.ckpt_dir}')
        saver.set_out_dir(self.ckpt_dir)


        # Create summary writer
        sw = sw_cls(self.log_dir)
        self.summarizer = vis.summarizable_module.Summarizer(sw)
        net.register_summarizer(self.summarizer)
        self.blueprint.register_summarizer(self.summarizer)
        # superclass setup
        super(MultiscaleTrainer, self).__init__(dl_train, dl_val, [self.optim], net, sw,
                                                max_epochs=self.config_dl.max_epochs,
                                                log_config=log_config, saver=saver, skip_to_itr=skip_to_itr)
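
The optimizer setup above picks the optimizer class by dispatching on a config string through a dict. A minimal runnable sketch of that dispatch pattern (the linear model and hyperparameters are placeholders, not taken from the example):

from torch import nn, optim

config_optim = 'Adam'   # e.g. the value of config_ms.optim
optim_cls = {'RMSprop': optim.RMSprop,
             'Adam': optim.Adam,
             'SGD': optim.SGD}[config_optim]

net = nn.Linear(10, 2)  # placeholder network
optimizer = optim_cls(net.parameters(), lr=1e-3, weight_decay=1e-4)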
Example #27
trainers = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

for i in range(PARALLEL_SIZE):
  trainer = Trainer(i,
                    global_network,
                    initial_learning_rate,
                    learning_rate_input,
                    grad_applier,
                    MAX_TIME_STEP,
                    device = device)
  trainers.append(trainer)

# prepare session
config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

init = tf.global_variables_initializer()
# init = tf.initialize_all_variables()
sess.run(init)

# summary for tensorboard
Example #28
def train():
    trainer = Trainer(flags.cfg)
    trainer.train()
Example #29
def train_separate(task, dataset='1k', iterations=1, batch_size=128):

    # get data for task
    data, metadata = gather(dataset, task)

    # build data format
    dformat = ['contexts', 'questions', 'answers']

    # create feeds
    trainfeed = DataFeed(dformat, data=data['train'])
    testfeed = DataFeed(dformat, data=data['test'])

    # instantiate model
    model = RelationNet(clen=metadata['clen'],
                        qlen=metadata['qlen'],
                        slen=metadata['slen'],
                        vocab_size=metadata['vocab_size'],
                        num_candidates=metadata['candidates']['vocab_size'])

    # info
    print(':: <task {}> [0/2] Info'.format(task))
    print(':: \t memory size : {}, #candidates : {}'.format(
        metadata['clen'], metadata['candidates']['vocab_size']))

    # create visualizer
    vis = Visualizer()
    vis.attach_scalars(model)

    with tf.Session() as sess:
        # run for multiple initializations
        i, accuracy, model_params = 0, [0.], [None]
        while accuracy[-1] < 0.95 and i < iterations:
            # init session
            sess.run(tf.global_variables_initializer())

            # add graph to visualizer
            vis.attach_graph(sess.graph)

            # create trainer
            trainer = Trainer(sess,
                              model,
                              trainfeed,
                              testfeed,
                              batch_size=batch_size)

            print('\n:: <task {}> ({}) [1/1] Training'.format(task, i))
            # train
            acc = trainer.fit(epochs=1000000,
                              eval_interval=1,
                              mode=Trainer.TRAIN,
                              verbose=True,
                              lr=0.0002)

            print(':: \tAccuracy after training: ', acc)

            # next iteration
            i += 1
            # add accuracy to list
            accuracy.append(acc)
            model_params.append(sess.run(tf.trainable_variables()))
            print(acc)

        print(':: [x/x] End of training')
        print(':: Max accuracy :', max(accuracy))

        # return model and model params
        return model, sess.run(tf.trainable_variables())
Example #30
    vis.attach_scalars(model)
    #vis.attach_params() # histograms of trainable variables

    # gpu config
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # init session
        sess.run(tf.global_variables_initializer())

        # add graph to visualizer
        vis.attach_graph(sess.graph)

        # init trainer
        trainer = Trainer(sess, model, datasrc, batch_size)

        # fit model
        trainer.fit(epochs=600,
                    mode=Trainer.TRAIN,
                    verbose=True,
                    visualizer=vis,
                    eval_interval=1,
                    early_stop=False)
        '''
        print('****************************************************************** PRETRAINING OVER ')
        for task_id in reversed(range(21)):
            datasrc.task_id = task_id
            loss, acc = trainer.evaluate()
            print('evaluation loss for task_id = {}\t\tloss = {}\t\t accuracy = {}'.format(task_id, loss, acc))