import datetime
import logging
import os

import torch

# get_train_args, get_rnn_train_args, init_train_env, load_examples,
# load_examples_for_rnn, load_embd_from_file, RNNTracer, train, and the
# per-variant iteration functions are project-local helpers assumed to be
# imported from elsewhere in this repository. Each main() below is
# presumably the entry point of a separate training script.

logger = logging.getLogger(__name__)


def main():
    # Train the 'siamese2' T-BERT variant with negative sampling.
    args = get_train_args()
    model = init_train_env(args, tbert_type='siamese2')
    valid_examples = load_examples(args.data_dir, data_type="valid", model=model,
                                   num_limit=args.valid_num, overwrite=args.overwrite)
    train_examples = load_examples(args.data_dir, data_type="train", model=model,
                                   num_limit=args.train_num, overwrite=args.overwrite)
    train(args, train_examples, valid_examples, model,
          train_iter_method=train_with_neg_sampling)
    logger.info("Training finished")
def main():
    # Same 'siamese2' setup, but examples are loaded from explicit
    # "train"/"valid" subdirectories of data_dir.
    args = get_train_args()
    model = init_train_env(args, tbert_type='siamese2')
    train_dir = os.path.join(args.data_dir, "train")
    valid_dir = os.path.join(args.data_dir, "valid")
    train_examples = load_examples(train_dir, model=model, num_limit=args.train_num)
    valid_examples = load_examples(valid_dir, model=model, num_limit=args.valid_num)
    train(args, train_examples, valid_examples, model, train_with_neg_sampling)
    logger.info("Training finished")
def main():
    # Train the 'single'-architecture T-BERT variant.
    args = get_train_args()
    model = init_train_env(args, tbert_type='single')
    valid_examples = load_examples(args.data_dir, data_type="valid", model=model,
                                   num_limit=args.valid_num, overwrite=args.overwrite)
    train_examples = load_examples(args.data_dir, data_type="train", model=model,
                                   num_limit=args.train_num, overwrite=args.overwrite)
    train(args, train_examples, valid_examples, model, train_single_iteration)
    logger.info("Training finished")
def main():
    args = get_rnn_train_args()
    if args.is_no_padding:
        # Unpadded sequences must be processed one at a time.
        args.gradient_accumulation_steps = 1
        args.per_gpu_eval_batch_size = 1
        args.per_gpu_train_batch_size = 1
        args.logging_steps = args.logging_steps * 10
    args.exp_name = "{}_{}".format(
        args.exp_name, datetime.datetime.now().strftime("%m-%d-%H-%M-%S"))

    # embd_info = create_emb_layer("./we/glove.6B.300d.txt")
    embd_info = load_embd_from_file(args.embd_file_path)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initialize the distributed backend, which takes care of
        # synchronizing nodes/GPUs.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training builds
        # the model; all other processes wait here.
        torch.distributed.barrier()
    model = RNNTracer(hidden_dim=args.hidden_dim,
                      embd_info=embd_info,
                      embd_trainable=args.is_embd_trainable,
                      max_seq_len=args.max_seq_len,
                      is_no_padding=args.is_no_padding,
                      rnn_type=args.rnn_type)
    if args.local_rank == 0:
        # First process is done building the model; release the waiting
        # processes from the barrier.
        torch.distributed.barrier()

    model.to(args.device)
    model.device = args.device

    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    train_dir = os.path.join(args.data_dir, "train")
    valid_dir = os.path.join(args.data_dir, "valid")
    train_examples = load_examples_for_rnn(train_dir, model=model,
                                           num_limit=args.train_num)
    valid_examples = load_examples_for_rnn(valid_dir, model=model,
                                           num_limit=args.valid_num)

    logger.info("Training started")
    train(args, train_examples, valid_examples, model, train_rnn_iter)
    logger.info("Training finished")
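
# A minimal sketch of the usual script entry point: assuming each main()
# above lives in its own module, this guard runs it when the file is executed
# directly (e.g. `python train_trace_rnn.py --data_dir ./data`; the script
# name and flags shown here are illustrative assumptions, not confirmed by
# this file).
if __name__ == "__main__":
    main()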