def main():
    # Preprocess the data needed for training
    vocab_size_source, vocab_size_target = _pre.train_preprocess()
    # Create the model and its related variables
    transformer = nmt_model.get_model(vocab_size_source, vocab_size_target)
    # Start training
    trainer.train(transformer)
def main():
    # Preprocess the data needed for training
    vocab_size_source, vocab_size_target = _pre.train_preprocess()
    # Create the model and its related variables
    transformer = nmt_model.get_model(vocab_size_source, vocab_size_target)
    # Start training, validating on the held-out split every validation_freq epochs
    trainer.train(transformer,
                  validation_data=_config.validation_data,
                  validation_split=1 - _config.train_size,
                  validation_freq=_config.validation_freq)
def main():
    # Build the model with fixed source/target vocabulary sizes
    transformer = nmt_model.get_model(2894, 1787)
    learning_rate = trainer.CustomSchedule(_config.d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)
    # Objects tracked by the checkpoints that will be averaged after training
    trackables = {'transformer': transformer, 'optimizer': optimizer}
    model_key = 'transformer'
    model_dir = _config.checkpoint_path
    output_dir = _config.checkpoint_path + '_avg_ckpts'
    os.makedirs(output_dir, exist_ok=True)  # exist_ok makes a separate existence check redundant
    trainer.train(transformer,
                  validation_data=_config.validate_from_txt,
                  validation_split=1 - _config.train_size,
                  validation_freq=_config.validation_freq)
    # Average the last checkpoints into output_dir and report where they were written
    path = average_checkpoints(model_dir, output_dir, trackables,
                               max_count=8, model_key=model_key)
    print(path)
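# For reference, a minimal sketch of what trainer.CustomSchedule is assumed to
# implement: the warmup learning-rate schedule from "Attention Is All You Need",
# lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). The class body
# and the warmup_steps default are assumptions, not taken from this project.
import tensorflow as tf

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Linear warmup for warmup_steps steps, then inverse-sqrt decay
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)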
def start_training():
    """Starts the training asynchronously using the flask executor

    It runs the training based on the DSI_EXECUTE_ON environment variable
    and, at the end, removes the future from the executor
    """
    logging.getLogger(__name__).info("Training execution started...")
    # noinspection PyBroadException
    try:
        environment = execution_environment()
        if environment == DSI_EXECUTE_ON_LOCAL:
            if dvc_remote():
                train(dvc_data_repo=dvc_remote(),
                      dvc_ssh_user=ssh_username(),
                      dvc_ssh_password=ssh_password())
            else:
                train()
        elif environment == DSI_EXECUTE_ON_SSH:
            connection = SSHRemoteExecutor(host=ssh_host(),
                                           username=ssh_username(),
                                           password=ssh_password(),
                                           debug_mode=debug_mode() or flask_args.debug,
                                           port=ssh_port(),
                                           dvc_remote=dvc_remote())
            connection.setup_prerequisites()
            connection.run_training()
            connection.save_model_locally()
        else:
            raise Exception("{0} has an unknown value '{1}'".format(
                DSI_EXECUTE_ON, environment))
        logging.getLogger(__name__).info("Training execution ended!")
    except Exception as training_exc:
        # This exception is broad because we cannot foresee all possible
        # exceptions in the DS train code. Also, since this training runs in
        # a separate thread, all exceptions should be caught here.
        logging.getLogger(__name__).info("Training execution raised an exception...")
        f = io.StringIO()
        traceback.print_exc(file=f)
        f.seek(0)
        logging.getLogger(__name__).error(f.read())
        raise ValueError(training_exc)
def train_wrapper(model):
    # Resume training from a saved checkpoint if requested
    resume_count = 1
    if args.resume:
        model.load(args.pretrained_model)
        resume_count = args.resume_count

    # Load the training and validation data
    train_loader = DataLoader(dataset=SunspotData(args.train_data_paths, args),
                              num_workers=args.num_work,
                              batch_size=args.batch_size,
                              shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(dataset=SunspotData(args.valid_data_paths, args),
                              num_workers=0,
                              batch_size=args.batch_size,
                              shuffle=True, pin_memory=False, drop_last=True)

    train_summary = SummaryHelper(save_path=os.path.join(args.logs_dir, 'train'),
                                  comment='custom', flush_secs=20)
    test_summary = SummaryHelper(save_path=os.path.join(args.logs_dir, 'test'),
                                 comment='custom', flush_secs=20)

    eta = args.sampling_start_value
    for epoch in range(resume_count, args.max_epoch + 1):
        loss = []
        model.train_mode()
        for itr, (imgs, names) in enumerate(train_loader):
            # Scheduled sampling: decay eta and draw the ground-truth/prediction mask
            eta, real_input_flag = schedule_sampling(eta, epoch)
            real_input_flag = torch.from_numpy(real_input_flag)
            itr_loss = trainer.train(model, imgs, real_input_flag, args, epoch, itr)
            loss.append(itr_loss)
        train_loss_avg = np.mean(loss)
        train_summary.add_scalar('train/loss', train_loss_avg, global_step=epoch)
        if epoch % args.snapshot_interval == 0:
            model.save(epoch)
        if epoch % args.test_interval == 0:
            model.eval_mode()
            metrics = trainer.test(model, valid_loader, args, epoch,
                                   args.gen_frm_dir, args.is_sunspots)
            test_summary.add_scalars('test', metrics, global_step=epoch)
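# A minimal sketch of the schedule_sampling helper assumed by train_wrapper.
# eta is the probability of feeding the ground-truth frame instead of the
# model's own prediction, decayed toward zero as training progresses. The
# shapes, defaults, and parameter names below are assumptions modelled on
# common video-prediction training loops, not this project's code.
import numpy as np

def schedule_sampling(eta, epoch,
                      batch_size=8, total_length=20, input_length=10,
                      img_height=64, img_width=64, img_channel=1,
                      sampling_changing_rate=0.00002, sampling_stop_epoch=50000):
    # Decay the probability of feeding ground truth until it reaches zero
    if epoch < sampling_stop_epoch:
        eta = max(eta - sampling_changing_rate, 0.0)
    else:
        eta = 0.0
    # One flag per frame the model must generate beyond the input frames:
    # 1.0 -> feed the ground-truth frame, 0.0 -> feed the model's prediction
    random_flip = np.random.random_sample((batch_size, total_length - input_length - 1))
    real_input_flag = np.zeros((batch_size, total_length - input_length - 1,
                                img_height, img_width, img_channel), dtype=np.float32)
    real_input_flag[random_flip < eta] = 1.0
    return eta, real_input_flag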
if os.path.exists("./weight/CNN/") == False: os.mkdir("./weight/CNN/") if FLAGS.train: model = cnn_model.Model(FLAGS.keep_prob, FLAGS.class_num) print(model) try: model.load_state_dict(torch.load("./weight/CNN/weight.pt")) print("\n***\nCheckpoint found\nModel Restored\n***\n") except: print("\n***\nNo Checkpoint found\nTraining from begining\n***\n") trainer = trainer.Trainer(FLAGS.data, model, FLAGS.bsize, FLAGS.lr, FLAGS.epoch) trainer.train() torch.save(model.state_dict(), "./weight/CNN/weight.pt") if FLAGS.predict: model = cnn_model.Model(FLAGS.keep_prob, FLAGS.class_num) print(model) try: model.load_state_dict(torch.load("./weight/CNN/weight.pt")) print("\n***\nCheckpoint found\nModel Restored\n***\n") except: print("\n***\nNo Checkpoint found\nPrediction Abort, train the model first.\n***\n") sys.exit() predictor = trainer.Predictor(FLAGS.data, model)
""" This file contains the script for executing the training in a remote machine """ if __name__ == '__main__': import argparse import os from services.infrastructure.logging import initialize_logging from model.trainer import train parser = argparse.ArgumentParser(description="Remote training script") parser.add_argument("--env", "-e", required=True, type=str, help="Environment folder/name") parser.add_argument("--debug", "-d", action="store_true", help="Enables debug mode") args = parser.parse_args() os.chdir(args.env) initialize_logging(path="training-remote.log", remote=True, debug=args.debug) train()
parser.add_argument("--dvc_remote", required=False,
                    type=str, default=None,
                    help="dvc remote repository name")
parser.add_argument("--dvc_user", "-u", required=False,
                    type=str, default=None,
                    help="ssh user for the remote dvc repository")
parser.add_argument("--dvc_password", "-p", required=False,
                    type=str, default=None,
                    help="ssh password for the remote dvc repository")
# store_true keeps debug off by default; the original `default=True` made the
# flag always truthy regardless of the value passed on the command line
parser.add_argument("--debug", "-d", action="store_true",
                    help="Enables debug mode")
args = parser.parse_args()

os.chdir(args.env)
initialize_logging(path="training-remote.log", remote=True, debug=args.debug)
train(dvc_data_repo=args.dvc_remote,
      dvc_ssh_user=args.dvc_user,
      dvc_ssh_password=args.dvc_password)
def main():
    # Configure command-line options
    parser = OptionParser(version='%prog V1.0')
    parser.add_option("-t", "--type", action="store", type="string",
                      dest="type", default="translate",
                      help="TYPE: train/eval/translate")
    if len(sys.argv) > 1 and sys.argv[1] not in ['-t', '--type']:
        print('Error: no option ' + sys.argv[1])
        print(parser.format_option_help())
    (options, args) = parser.parse_args()

    if options.type == 'train':
        # Load the sentence pairs
        en, ch = _pre.load_sentences(_config.path_to_train_file, _config.num_sentences)
        # Preprocess the sentences
        en = _pre.preprocess_sentences_en(en, mode=_config.en_tokenize_type)
        ch = _pre.preprocess_sentences_ch(ch, mode=_config.ch_tokenize_type)
        # Build and save the vocabularies
        tokenizer_en, vocab_size_en = _pre.create_tokenizer(
            sentences=en, mode=_config.en_tokenize_type,
            save_path=_config.en_bpe_tokenizer_path)
        tokenizer_ch, vocab_size_ch = _pre.create_tokenizer(
            sentences=ch, mode=_config.ch_tokenize_type,
            save_path=_config.ch_tokenizer_path)
        print('vocab_size_en:%d' % vocab_size_en)
        print('vocab_size_ch:%d' % vocab_size_ch)
        # Encode the sentences
        tensor_en, max_sequence_length_en = _pre.encode_sentences(
            sentences=en, tokenizer=tokenizer_en, mode=_config.en_tokenize_type)
        tensor_ch, max_sequence_length_ch = _pre.encode_sentences(
            sentences=ch, tokenizer=tokenizer_ch, mode=_config.ch_tokenize_type)
        # Create the model and its related variables
        optimizer, train_loss, train_accuracy, transformer = network.get_model(
            vocab_size_en, vocab_size_ch)
        # Start training
        trainer.train(tensor_en, tensor_ch, transformer, optimizer,
                      train_loss, train_accuracy)
    elif options.type == 'eval' or options.type == 'translate':
        if_ckpt = _pre.check_point()  # Check whether a checkpoint exists
        if if_ckpt:
            # Load the English and Chinese vocabularies
            tokenizer_en, vocab_size_en = _pre.get_tokenizer(
                path=_config.en_bpe_tokenizer_path, mode=_config.en_tokenize_type)
            tokenizer_ch, vocab_size_ch = _pre.get_tokenizer(
                path=_config.ch_tokenizer_path, mode=_config.ch_tokenize_type)
            print('vocab_size_en:%d' % vocab_size_en)
            print('vocab_size_ch:%d' % vocab_size_ch)
            # Create the model and its related variables
            optimizer, _, _, transformer = network.get_model(vocab_size_en, vocab_size_ch)
            # Load the checkpoint
            network.load_checkpoint(transformer, optimizer)
            if options.type == 'eval':
                # Evaluation mode
                print('-' * 30)
                print('Available metrics: 1. BLEU  0. Exit')
                eval_mode = input('Enter the number of the metric to use: ')
                if eval_mode == '1':
                    eval.calc_bleu(_config.path_to_eval_file, transformer,
                                   tokenizer_en, tokenizer_ch)
                elif eval_mode == '0':
                    print('Thanks for trying it out!')
                else:
                    print('Please enter a valid number')
            elif options.type == 'translate':
                # Translation mode
                while True:
                    print('-' * 30)
                    print('Enter 0 to exit')
                    sentence = input('Enter the sentence to translate: ')
                    if sentence == '0':
                        break
                    else:
                        print('Translation:',
                              translator.translate(sentence, transformer,
                                                   tokenizer_en, tokenizer_ch))
        else:
            print('Please train the model before using the other features...')
    elif len(sys.argv) > 2:
        print('Error: no TYPE ' + sys.argv[2])
        print(parser.format_option_help())
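# Hypothetical entry point and example invocations for the dispatcher above
# (the script name main.py is an assumption):
#   python main.py -t train       # preprocess data, build vocabularies, train
#   python main.py -t eval        # score the model with BLEU on the eval file
#   python main.py -t translate   # interactive translation loop (default mode)
if __name__ == '__main__':
    main()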
from model.conv_net import conv_net
from model.trainer import neural_net_trainer, train
from utils.image_handler import load_datasets

kwargs = {}
X, Yobj, Yemt = load_datasets(img_dir='Images')

# Object labeller - man, woman or child
net_obj = conv_net(input_shape=X[0].shape)
model_obj = neural_net_trainer(net_obj, best_checkpoint_path='best_obj_model')
train(model_obj, X, Yobj, save_path='obj_model.tfl')

# Emotion labeller - happy, sad etc.
# Use the per-sample shape here as well; the original passed X.shape, which
# includes the batch dimension and is inconsistent with the object labeller.
net_emt = conv_net(input_shape=X[0].shape)
model_emt = neural_net_trainer(net_emt, best_checkpoint_path='best_emotion_model')
train(model_emt, X, Yemt, save_path='emt_model.tfl')
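# A minimal sketch of what neural_net_trainer and train are assumed to wrap,
# given the .tfl save paths and the best_checkpoint_path argument (both
# TFLearn conventions). The framework choice, function bodies, and the
# n_epoch/validation_set values are assumptions, not this project's code.
import tflearn

def neural_net_trainer(net, best_checkpoint_path):
    # Wrap the graph in a TFLearn DNN that checkpoints the best validation run
    return tflearn.DNN(net, best_checkpoint_path=best_checkpoint_path)

def train(model, X, Y, save_path, n_epoch=20):
    # Fit on 90% of the data, validate on the rest, then persist the weights
    model.fit(X, Y, n_epoch=n_epoch, validation_set=0.1, show_metric=True)
    model.save(save_path)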