def main(args):
    # fix_seed() is intentionally not called here: results are better without a
    # fixed seed, and fixing it would make shuffling pointless.
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)

    # read datasets
    reader = get_data_reader(weights_name, args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}

    # Build vocab. This variant is modified so that, whether or not a vocabulary
    # path is passed in, a vocabulary is built from the training data; if an
    # existing vocabulary is given, it is extended (merged) with entries from the data.
    old_vocab = Vocabulary.from_files(args.vocab_path) if args.vocab_path else None

    # Build a vocabulary directly from the training instances via allennlp.
    # The result corresponds to the contents of the output vocabulary directory.
    new_vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)

    if old_vocab is not None:
        # Merge: extend the existing vocabulary with entries from the training data.
        from allennlp.common.params import Params
        params = Params({"non_padded_namespaces": set(namespaces)})
        old_vocab.extend_from_instances(params, train_data)
        vocab = old_vocab
    else:
        vocab = new_vocab

    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    # Path to the saved labels vocabulary, kept for reference.
    from pathlib import Path
    vocab_labels_path = Path(__file__).resolve().parent.parent / os.path.join(
        args.model_dir, 'vocabulary', 'labels.txt')

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        # Pretrained weights are normally not loaded through this path.
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)
    print("Model is set", "model loading finished")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)

    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        iterator=iterator,
        train_dataset=train_data,
        validation_dataset=dev_data,
        serialization_dir=args.model_dir,
        patience=args.patience,
        num_epochs=args.n_epoch,
        cuda_device=cuda_device,
        shuffle=True,  # changed: shuffle the training data
        accumulated_batch_count=args.accumulation_size,
        cold_step_count=args.cold_steps_count,
        cold_lr=args.cold_lr,
        cuda_verbose_step=int(args.cuda_verbose_steps)
        if args.cuda_verbose_steps else None)

    print("Start training")
    trainer.train(args.oldmodel)

    # Here's how to save the model. The best weights are saved once more, so the
    # output directory only needs model.th; the per-epoch checkpoints can be ignored.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped", "training finished, model saved to", out_model)
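# A minimal, hypothetical CLI driver for main(args), shown only as a sketch: the
# actual argument parser is not part of this excerpt, and the flag names below are
# simply assumed to mirror the attributes accessed on `args` above (train_set,
# dev_set, model_dir, ...). Defaults are illustrative, not the project's; the
# remaining attributes used by main() would have to be declared the same way
# before this block could actually run.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_set', required=True, help='Path to the training data.')
    parser.add_argument('--dev_set', required=True, help='Path to the validation data.')
    parser.add_argument('--model_dir', required=True, help='Directory for vocabulary and checkpoints.')
    parser.add_argument('--transformer_model', default='bert', help='Backbone transformer name.')
    parser.add_argument('--vocab_path', default='', help='Optional existing vocabulary to extend.')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=1e-5)
    parser.add_argument('--n_epoch', type=int, default=20)
    # ...declare the remaining flags referenced via args.* in main() here.

    main(parser.parse_args())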
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)

    # read datasets
    reader = get_data_reader(weights_name, args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}

    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th'),
                       map_location=torch.device('cpu')))

    model = model.to(device)
    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)

    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)

    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
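# Note on instances_per_epoch above: the iterator counts *instances*, while
# updates_per_epoch is expressed in optimizer updates, so the conversion is
# updates * batch_size * accumulation_size. The helper and numbers below are
# purely illustrative, not project defaults.
def _instances_per_epoch(updates_per_epoch, batch_size, accumulation_size):
    # Each optimizer update consumes accumulation_size batches of batch_size instances.
    return int(updates_per_epoch * batch_size * accumulation_size)

assert _instances_per_epoch(10000, 32, 4) == 1_280_000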
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)

    # read datasets
    reader = get_data_reader(weights_name, args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}

    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    print('cuda_device:', cuda_device)
    model = model.to(device)
    print("Model is set")

    def print_size_of_model(model):
        # Report the size of the serialized state dict in MB.
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    if args.keep != 12:
        # Load a reference model that keeps only `args.keep` encoder layers and
        # graft its (smaller) encoder into the freshly built model.
        prev_model = GecBERTModel(
            vocab_path=args.vocab_path,
            model_paths=args.model_path,
            max_len=args.max_len,
            min_len=args.min_len,
            iterations=args.iteration_count,
            min_error_probability=args.min_error_probability,
            min_probability=args.min_error_probability,
            lowercase_tokens=args.lowercase_tokens,
            model_name=args.transformer_model,
            special_tokens_fix=args.special_tokens_fix,
            log=False,
            confidence=args.additional_confidence,
            is_ensemble=args.is_ensemble,
            weigths=args.weights,
            num_layers_to_keep=args.keep)
        print_size_of_model(model)
        print_size_of_model(prev_model.models[0])
        model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = \
            prev_model.models[0].text_field_embedder.token_embedder_bert.bert_model.encoder.layer
        print_size_of_model(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)

    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)

    GPUtil.showUtilization()
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
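# A minimal sketch of how the dumped weights could be reloaded for evaluation,
# assuming the same helpers used above (get_weights_name, get_model, Vocabulary)
# are importable and that model_dir contains the 'vocabulary' folder and 'model.th'
# written by main(). The dropout/label-smoothing values are placeholders, and the
# function name itself is hypothetical, not part of the project.
def load_trained_model(model_dir, transformer_model, lowercase_tokens=0,
                       special_tokens_fix=0):
    weights_name = get_weights_name(transformer_model, lowercase_tokens)
    vocab = Vocabulary.from_files(os.path.join(model_dir, 'vocabulary'))
    model = get_model(weights_name, vocab,
                      tune_bert=False,
                      predictor_dropout=0.0,
                      label_smoothing=0.0,
                      special_tokens_fix=special_tokens_fix)
    state = torch.load(os.path.join(model_dir, 'model.th'),
                       map_location=torch.device('cpu'))
    model.load_state_dict(state)
    model.eval()
    return model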