if OPTS.train_sgd_steps > 0: tb_str += "_imit{}".format(OPTS.train_sgd_steps) tb_logdir = os.path.join(HOME_DIR, "tensorboard", "ebm", "{}_cassio".format(OPTS.dtok), tb_str) for logdir in [tb_logdir + "_train", tb_logdir + "_dev"]: os.makedirs(logdir, exist_ok=True) else: tb_logdir = os.path.join(OPTS.root, "tensorboard") if not os.path.exists(tb_logdir): os.mkdir(tb_logdir) # Get the path variables (train_src_corpus, train_tgt_corpus, distilled_tgt_corpus, truncate_datapoints, test_src_corpus, test_tgt_corpus, ref_path, src_vocab_path, tgt_vocab_path, n_valid_per_epoch, training_warmsteps, training_maxsteps, pretrained_autoregressive_path) = get_dataset_paths(OPTS.root, OPTS.dtok) if OPTS.longertrain: training_maxsteps = int(training_maxsteps * 1.5) if OPTS.x3longertrain: training_maxsteps = int(training_maxsteps * 3) if nmtlab.__version__ < "0.7.0": print("lanmt now requires nmtlab >= 0.7.0") print("Update by pip install -U nmtlab") sys.exit() if OPTS.fp16: print("fp16 option is not ready") sys.exit() # Define dataset
# Get the path variables ( train_src_corpus, train_tgt_corpus, distilled_tgt_corpus, truncate_datapoints, test_src_corpus, test_tgt_corpus, ref_path, src_vocab_path, tgt_vocab_path, n_valid_per_epoch, training_warmsteps, training_maxsteps, pretrained_autoregressive_path ) = get_dataset_paths(OPTS.root, OPTS.dtok) if OPTS.longertrain: training_maxsteps = int(training_maxsteps * 1.5) if OPTS.x3longertrain: training_maxsteps = int(training_maxsteps * 3) if nmtlab.__version__ < "0.7.0": print("lanmt now requires nmtlab >= 0.7.0") print("Update by pip install -U nmtlab") sys.exit() # Define dataset if OPTS.distill: tgt_corpus = distilled_tgt_corpus else:
torch.cuda.set_device(hvd.local_rank()) part_index = hvd.rank() part_num = hvd.size() gpu_num = hvd.size() else: part_index = 0 part_num = 1 gpu_num = 1 if is_root_node(): print("Running on {} GPUs".format(gpu_num)) # Get the path variables (train_src_corpus, train_tgt_corpus, distilled_tgt_corpus, truncate_datapoints, test_src_corpus, test_tgt_corpus, ref_path, src_vocab_path, tgt_vocab_path, n_valid_per_epoch, training_warmsteps, training_maxsteps, pretrained_autoregressive_path) = get_dataset_paths(DATA_ROOT, OPTS.dtok) if OPTS.longertrain: training_maxsteps = int(training_maxsteps * 1.5) if nmtlab.__version__ < "0.7.0": print("lanmt now requires nmtlab >= 0.7.0") print("Update by pip install -U nmtlab") sys.exit() if OPTS.fp16: print("fp16 option is not ready") sys.exit() # Define dataset if OPTS.distill: tgt_corpus = distilled_tgt_corpus
# Command-line options for the tree2code model.
ap.add_argument("--opt_hiddensz", type=int, default=256)
ap.add_argument("--opt_without_source", action="store_true")
ap.add_argument("--opt_codebits", type=int, default=0)
ap.add_argument("--opt_limit_tree_depth", type=int, default=0)
ap.add_argument("--opt_limit_datapoints", type=int, default=-1)
ap.add_argument("--opt_load_pretrain", action="store_true")
# NOTE(review): DATA_ROOT is used in these defaults but is (re)assigned to
# "./mydata" only further below — confirm DATA_ROOT is already defined before
# this chunk, otherwise these two lines raise NameError at startup.
ap.add_argument("--model_path", default="{}/tree2code.pt".format(DATA_ROOT))
ap.add_argument("--result_path", default="{}/tree2code.result".format(DATA_ROOT))
OPTS.parse(ap)
n_valid_per_epoch = 4

# Define datasets
DATA_ROOT = "./mydata"
dataset_paths = get_dataset_paths(DATA_ROOT, OPTS.dtok)

# Using horovod for training, automatically occupy all GPUs
# Determine the local rank
horovod_installed = importlib.util.find_spec("horovod") is not None
if torch.cuda.is_available() and horovod_installed:
    # Distributed run: one process per GPU, ranked by horovod.
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    part_index = hvd.rank()
    part_num = hvd.size()
    gpu_num = hvd.size()
else:
    # Single-process fallback when horovod/CUDA is unavailable.
    part_index = 0
    part_num = 1
    gpu_num = 1