def test():
    """Load saved BERT and MLM checkpoints and run the trainer's evaluation.

    All settings come from the command line and every option has a default.
    The function builds the train/valid datasets and loaders, loads both
    checkpoints with ``torch.load``, wraps them in a ``BERTTrainer`` and calls
    ``trainer.evaluate_and_print(vocab)``.

    Raises:
        SystemExit: raised by argparse when an option cannot be parsed.
    """

    def parse_bool(value):
        """Convert a command-line string into a real boolean."""
        # argparse's ``type=bool`` evaluates ``bool("False")``, which is True
        # for every non-empty string, so the flag could never be switched off.
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays falsy, matching what ``bool("")`` returned.
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean (true/false), got {!r}".format(value))

    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--train_dataset", type=str, help="train dataset for train bert",
                        default='./data/corpus_pre.txt')
    parser.add_argument("-t", "--valid_dataset", type=str, help="valid set for evaluate train set",
                        default='./data/corpus_pre.txt')
    parser.add_argument("-v", "--vocab_path", type=str, help="built vocab model path with vocab",
                        default='./data/vocab.test')
    parser.add_argument("-o", "--output_path", type=str, help="output/bert.model",
                        default='./output')
    parser.add_argument("-w", "--num_workers", type=int, default=0, help="dataloader worker size")
    parser.add_argument("--with_cuda", type=parse_bool, default=False,
                        help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=[0, 1, 2, 3],
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=parse_bool, default=True,
                        help="Loading on memory: true or false")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    # Checkpoint locations used to be hard-coded; the defaults keep the old paths.
    parser.add_argument("--bert_model_path", type=str,
                        default='./output/model_bert/bert_ep10.model',
                        help="path of the saved BERT checkpoint to evaluate")
    parser.add_argument("--mlm_model_path", type=str,
                        default='./output/model_mlm/mlm_ep10.model',
                        help="path of the saved MLM checkpoint to evaluate")
    args = parser.parse_args()

    set_seed(args)
    paths = Paths(args.output_path)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    # The vocabulary size is stored on args, which is later handed to the trainer.
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = None
    if args.valid_dataset is not None:
        valid_dataset = BERTDataset(args.valid_dataset, vocab, on_memory=args.on_memory)

    print("Creating Dataloader")
    # collate_mlm is passed directly: a lambda wrapper cannot be pickled, which
    # breaks DataLoader workers on platforms that start them with "spawn".
    # The training corpus is already sorted by length (presumably why shuffle is off).
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=collate_mlm,
                                   num_workers=args.num_workers,
                                   shuffle=False)
    valid_data_loader = None
    if valid_dataset is not None:
        valid_data_loader = DataLoader(valid_dataset,
                                       batch_size=hp.batch_size,
                                       collate_fn=collate_mlm,
                                       num_workers=args.num_workers,
                                       shuffle=False)

    print("Load BERT model")
    # NOTE: torch.load unpickles arbitrary objects; only load checkpoints that
    # come from a trusted source.
    bert = torch.load(args.bert_model_path)
    model = torch.load(args.mlm_model_path)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, vocab.vocab_size, model,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          args=args,
                          path=paths)

    print("Training Start")
    trainer.evaluate_and_print(vocab)
def train():
    """Build a fresh BERT model from command-line settings and train it.

    The train/valid corpora, vocabulary and output path are required options.
    The function builds the datasets and loaders, constructs ``BERT`` from the
    ``hp`` hyper-parameters, wraps it in a ``BERTTrainer`` and calls
    ``trainer.train()``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an option cannot be parsed.
    """

    def parse_bool(value):
        """Convert a command-line string into a real boolean."""
        # argparse's ``type=bool`` evaluates ``bool("False")``, which is True
        # for every non-empty string, so the flag could never be switched off.
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays falsy, matching what ``bool("")`` returned.
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean (true/false), got {!r}".format(value))

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t", "--valid_dataset", required=True, type=str,
                        help="valid set for evaluate train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="built vocab model path with vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="output/bert.model")
    parser.add_argument("-w", "--num_workers", type=int, default=0, help="dataloader worker size")
    parser.add_argument("--with_cuda", type=parse_bool, default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=[0, 1, 2, 3],
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=parse_bool, default=True,
                        help="Loading on memory: true or false")
    args = parser.parse_args()

    paths = Paths(args.output_path)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    # The vocabulary size is stored on args, which is later handed to BERT
    # and to the trainer.
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = None
    if args.valid_dataset is not None:
        valid_dataset = BERTDataset(args.valid_dataset, vocab, on_memory=args.on_memory)

    print("Creating Dataloader")
    # collate_mlm is passed directly: a lambda wrapper cannot be pickled, which
    # breaks DataLoader workers on platforms that start them with "spawn".
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=collate_mlm,
                                   num_workers=args.num_workers,
                                   shuffle=False)
    valid_data_loader = None
    if valid_dataset is not None:
        valid_data_loader = DataLoader(valid_dataset,
                                       batch_size=hp.batch_size,
                                       collate_fn=collate_mlm,
                                       num_workers=args.num_workers,
                                       shuffle=False)

    print("Building BERT model")
    bert = BERT(embed_dim=hp.embed_dim, hidden=hp.hidden, args=args)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, vocab.vocab_size,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          args=args,
                          path=paths)

    print("Training Start")
    trainer.train()
def train():
    """Load an existing checkpoint and continue training or evaluate it.

    The corpora, vocabulary, output path and checkpoint path are required
    command-line options. ``--mode`` selects which trainer method runs:
    ``train`` calls ``trainer.train()`` and ``eval`` calls ``trainer.eval()``.

    Raises:
        SystemExit: raised by argparse when a required option is missing, an
            option cannot be parsed, or ``--mode`` is not ``train``/``eval``.
    """

    def parse_bool(value):
        """Convert a command-line string into a real boolean."""
        # argparse's ``type=bool`` evaluates ``bool("False")``, which is True
        # for every non-empty string, so the flag could never be switched off.
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays falsy, matching what ``bool("")`` returned.
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean (true/false), got {!r}".format(value))

    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t", "--valid_dataset", required=True, type=str,
                        help="valid set for evaluate train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="built vocab model path with vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="ex)output/bert.model")
    parser.add_argument("-m", "--model_path", required=True, type=str,
                        help="Path of exist mlm model")
    parser.add_argument("-w", "--num_workers", type=int, default=1, help="dataloader worker size")
    parser.add_argument("--with_cuda", type=parse_bool, default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=[0, 1, 2, 3],
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=parse_bool, default=True,
                        help="Loading on memory: true or false")
    # Only 'train' and 'eval' are acted on below. Restricting the choices makes
    # a mistyped mode fail immediately instead of silently doing nothing after
    # the datasets and the checkpoint have been loaded.
    parser.add_argument('--mode', type=str, default='train', choices=('train', 'eval'),
                        help="train or eval")
    parser.add_argument('--seed', type=int, default=3431, help="random seed for initialization")
    args = parser.parse_args()

    set_seed(args)
    paths = Paths(args.output_path)
    mode = args.mode

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    # The vocabulary size is stored on args, which is later handed to the trainer.
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    # NOTE(review): train=False is passed even for the training corpus —
    # confirm against BERTDataset that this is intended.
    train_dataset = BERTDataset(args.train_dataset, vocab,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory,
                                train=False)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = None
    if args.valid_dataset is not None:
        valid_dataset = BERTDataset(args.valid_dataset, vocab,
                                    on_memory=args.on_memory,
                                    train=False)

    print("Creating Dataloader")
    # collate_mlm is passed directly: a lambda wrapper cannot be pickled, which
    # breaks DataLoader workers on platforms that start them with "spawn"
    # (num_workers defaults to 1 here, so workers are used by default).
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=collate_mlm,
                                   num_workers=args.num_workers,
                                   shuffle=True)
    valid_data_loader = None
    if valid_dataset is not None:
        valid_data_loader = DataLoader(valid_dataset,
                                       batch_size=hp.batch_size,
                                       collate_fn=collate_mlm,
                                       num_workers=args.num_workers,
                                       shuffle=True)

    print("Load BERT model")
    # NOTE: torch.load unpickles arbitrary objects; only load checkpoints that
    # come from a trusted source.
    bert = torch.load(args.model_path)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, vocab.vocab_size,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          args=args,
                          global_step=0,
                          path=paths)

    print("Training Start")
    if mode == 'train':
        trainer.train()
    elif mode == 'eval':
        trainer.eval()