def preprocess() -> argparse.Namespace:
    """
    preprocess of training
    :return: config args
    """
    print('preprocessing starts...\n')
    # ====== parse arguments ====== #
    args = parse_args()
    # ====== set random seed ====== #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    # ====== save path ====== #
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    args.save_path = os.path.join('./logs/', 'my_log-' + now_time)
    if not os.path.exists(args.save_path) and not args.debug:
        os.makedirs(args.save_path)
    # ====== fitlog init ====== #
    fitlog.commit(__file__)
    fitlog.debug(args.debug)
    fitlog.add_hyper(args)
    # ====== tb VisualLogger init ====== #
    args.visual_logger = VisualLogger(args.save_path) if not args.debug else None
    # ====== cuda enable ====== #
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    args.device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
    # ====== others ====== #
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    torch.set_num_threads(6)
    print(args, end='\n\n')
    return args
import torch as tc
import torch.nn as nn
import torch.nn.functional as F
import pdb

import fitlog

from .transformer_sublayers import Encoder, Decoder
from fastNLP.embeddings.bert_embedding import BertEmbedding
from fastNLP.embeddings.static_embedding import StaticEmbedding

fitlog.commit(__file__)


class Model(nn.Module):
    def __init__(self, vocab, logger, num_layers=6, d_hid=1024, h=8, d_model=512, dropout=0.0):
        super().__init__()

        # self.b_embedding = BertEmbedding(vocab, model_dir_or_name='cn-wwm', requires_grad=False, layers='4,-2,-1')
        # self.b_emb_outer = nn.Linear(self.b_embedding.embed_size, d_model)

        self.s_embedding = StaticEmbedding(vocab, "cn-sgns-literature-word", requires_grad=True)
        self.s_emb_outer = nn.Linear(self.s_embedding.embed_size, d_model)

        # self.r_embedding = nn.Embedding(len(vocab), d_model, padding_idx=vocab.to_index("<pad>"))
        # self.r_emb_outer = nn.Linear(d_model, d_model)
def train():
    args = parse_args()
    if args.debug:
        fitlog.debug()
        args.save_model = False

    # ================= define =================
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    word_mask_index = tokenizer.mask_token_id
    word_vocab_size = len(tokenizer)

    if get_local_rank() == 0:
        fitlog.set_log_dir(args.log_dir)
        fitlog.commit(__file__, fit_msg=args.name)
        fitlog.add_hyper_in_file(__file__)
        fitlog.add_hyper(args)

    # ================= load data =================
    dist.init_process_group('nccl')
    init_logger_dist()

    n_proc = dist.get_world_size()
    bsz = args.batch_size // args.grad_accumulation // n_proc
    args.local_rank = get_local_rank()
    args.save_dir = os.path.join(args.save_dir, args.name) if args.save_model else None
    if args.save_dir is not None and os.path.exists(args.save_dir):
        raise RuntimeError('save_dir has already existed.')
    logger.info('save directory: {}'.format('None' if args.save_dir is None else args.save_dir))
    devices = list(range(torch.cuda.device_count()))
    NUM_WORKERS = 4

    ent_vocab, rel_vocab = load_ent_rel_vocabs()
    logger.info('# entities: {}'.format(len(ent_vocab)))
    logger.info('# relations: {}'.format(len(rel_vocab)))
    ent_freq = get_ent_freq()
    assert len(ent_vocab) == len(ent_freq), '{} {}'.format(len(ent_vocab), len(ent_freq))

    #####
    root = args.data_dir
    dirs = os.listdir(root)
    drop_files = []
    for dir in dirs:
        path = os.path.join(root, dir)
        max_idx = 0
        for file_name in os.listdir(path):
            if 'large' in file_name:
                continue
            max_idx = int(file_name) if int(file_name) > max_idx else max_idx
        drop_files.append(os.path.join(path, str(max_idx)))
    #####

    file_list = []
    for path, _, filenames in os.walk(args.data_dir):
        for filename in filenames:
            file = os.path.join(path, filename)
            if 'large' in file or file in drop_files:
                continue
            file_list.append(file)
    logger.info('used {} files in {}.'.format(len(file_list), args.data_dir))

    if args.data_prop > 1:
        used_files = file_list[:int(args.data_prop)]
    else:
        used_files = file_list[:round(args.data_prop * len(file_list))]

    data = GraphOTFDataSet(used_files, n_proc, args.local_rank, word_mask_index,
                           word_vocab_size, args.n_negs, ent_vocab, rel_vocab, ent_freq)
    dev_data = GraphDataSet(used_files[0], word_mask_index, word_vocab_size,
                            args.n_negs, ent_vocab, rel_vocab, ent_freq)

    sampler = OTFDistributedSampler(used_files, n_proc, get_local_rank())
    train_data_iter = TorchLoaderIter(dataset=data,
                                      batch_size=bsz,
                                      sampler=sampler,
                                      num_workers=NUM_WORKERS,
                                      collate_fn=data.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_data,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=dev_data.collate_fn)
    if args.test_data is not None:
        test_data = FewRelDevDataSet(path=args.test_data,
                                     label_vocab=rel_vocab,
                                     ent_vocab=ent_vocab)
        test_data_iter = TorchLoaderIter(dataset=test_data,
                                         batch_size=32,
                                         sampler=RandomSampler(),
                                         num_workers=NUM_WORKERS,
                                         collate_fn=test_data.collate_fn)

    if args.local_rank == 0:
        print('full wiki files: {}'.format(len(file_list)))
        print('used wiki files: {}'.format(len(used_files)))
        print('# of trained samples: {}'.format(len(data) * n_proc))
        print('# of trained entities: {}'.format(len(ent_vocab)))
        print('# of trained relations: {}'.format(len(rel_vocab)))

    # ================= prepare model =================
    logger.info('model init')
    if args.rel_emb is not None:  # load pretrained relation embeddings
        rel_emb = np.load(args.rel_emb)
        # add_embs = np.random.randn(3, rel_emb.shape[1])  # add <pad>, <mask>, <unk>
        # rel_emb = np.r_[add_embs, rel_emb]
        rel_emb = torch.from_numpy(rel_emb).float()
        assert rel_emb.shape[0] == len(rel_vocab), '{} {}'.format(rel_emb.shape[0], len(rel_vocab))
        # assert rel_emb.shape[1] == args.rel_dim
        logger.info('loaded pretrained relation embeddings. dim: {}'.format(rel_emb.shape[1]))
    else:
        rel_emb = None

    if args.model_name is not None:
        logger.info('further pre-train.')
        config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
        model = CoLAKE(config=config,
                       num_ent=len(ent_vocab),
                       num_rel=len(rel_vocab),
                       ent_dim=args.ent_dim,
                       rel_dim=args.rel_dim,
                       ent_lr=args.ent_lr,
                       ip_config=args.ip_config,
                       rel_emb=None,
                       emb_name=args.emb_name)
        states_dict = torch.load(args.model_name)
        model.load_state_dict(states_dict, strict=True)
    else:
        model = CoLAKE.from_pretrained('roberta-base',
                                       num_ent=len(ent_vocab),
                                       num_rel=len(rel_vocab),
                                       ent_lr=args.ent_lr,
                                       ip_config=args.ip_config,
                                       rel_emb=rel_emb,
                                       emb_name=args.emb_name,
                                       cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'dist_{}'.format(args.local_rank))
        model.extend_type_embedding(token_type=3)
    # if args.local_rank == 0:
    #     for name, param in model.named_parameters():
    #         if param.requires_grad is True:
    #             print('{}: {}'.format(name, param.shape))

    # ================= train model =================
    # lr=1e-4 for peak value, lr=5e-5 for initial value
    logger.info('trainer init')
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    word_acc = WordMLMAccuracy(pred='word_pred', target='masked_lm_labels', seq_len='word_seq_len')
    ent_acc = EntityMLMAccuracy(pred='entity_pred', target='ent_masked_lm_labels', seq_len='ent_seq_len')
    rel_acc = RelationMLMAccuracy(pred='relation_pred', target='rel_masked_lm_labels', seq_len='rel_seq_len')
    metrics = [word_acc, ent_acc, rel_acc]

    if args.test_data is not None:
        test_metric = [rel_acc]
        tester = Tester(data=test_data_iter,
                        model=model,
                        metrics=test_metric,
                        device=list(range(torch.cuda.device_count())))
        # tester.test()
    else:
        tester = None

    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)
    # warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    fitlog_callback = MyFitlogCallback(tester=tester, log_loss_every=100, verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    emb_callback = EmbUpdateCallback(model.ent_embeddings)
    all_callbacks = [gradient_clip_callback, emb_callback]
    if args.save_dir is None:
        master_callbacks = [fitlog_callback]
    else:
        save_callback = SaveModelCallback(args.save_dir, model.ent_embeddings, only_params=True)
        master_callbacks = [fitlog_callback, save_callback]

    if args.do_test:
        states_dict = torch.load(os.path.join(args.save_dir, args.model_name)).state_dict()
        model.load_state_dict(states_dict)
        data_iter = TorchLoaderIter(dataset=data,
                                    batch_size=args.batch_size,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=data.collate_fn)
        tester = Tester(data=data_iter, model=model, metrics=metrics, device=devices)
        tester.test()
    else:
        trainer = DistTrainer(train_data=train_data_iter,
                              dev_data=dev_data_iter,
                              model=model,
                              optimizer=optimizer,
                              loss=LossInForward(),
                              batch_size_per_gpu=bsz,
                              update_every=args.grad_accumulation,
                              n_epochs=args.epoch,
                              metrics=metrics,
                              callbacks_master=master_callbacks,
                              callbacks_all=all_callbacks,
                              validate_every=5000,
                              use_tqdm=True,
                              fp16='O1' if args.fp16 else '')
        trainer.train(load_best_model=False)
import fitlog

fitlog.commit(__file__)             # auto commit your codes
fitlog.add_hyper_in_file(__file__)  # record your hyperparameters

"""
Your training code here, you may use these functions to log your result:
    fitlog.add_hyper()
    fitlog.add_loss()
    fitlog.add_metric()
    fitlog.add_best_metric()
    ......
"""

fitlog.finish()                     # finish the logging
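As a concrete illustration of the placeholder in the template above, the sketch below wires the listed fitlog calls into a toy loop. The loop, the fabricated loss/accuracy values, and the metric name 'dev_acc' are illustrative assumptions for demonstration only, not part of the template.

# Minimal sketch of a filled-in template; the training loop and its numbers
# are stand-ins, only the fitlog calls mirror the template above.
import fitlog

fitlog.commit(__file__)             # auto commit your codes
fitlog.add_hyper_in_file(__file__)  # record your hyperparameters
fitlog.add_hyper(0.1, name='lr')    # hypothetical hyperparameter

best_acc = 0.0
global_step = 0
for epoch in range(3):
    for step in range(100):
        # stand-in for a real forward/backward pass
        loss = 1.0 / (global_step + 1)
        if global_step % 10 == 0:
            fitlog.add_loss(loss, step=global_step, name='loss', epoch=epoch)
        global_step += 1
    # stand-in for real evaluation on a dev set
    acc = 0.5 + 0.1 * epoch
    fitlog.add_metric(acc, step=epoch, name='dev_acc', epoch=epoch)
    if acc > best_acc:
        best_acc = acc
        fitlog.add_best_metric(best_acc, name='dev_acc')

fitlog.finish()                     # finish the logging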
parser.add_argument('--embed_dropout', type=float, default=0.5)
parser.add_argument('--gaz_dropout', type=float, default=-1)
parser.add_argument('--output_dropout', type=float, default=0.5)
parser.add_argument('--epoch', type=int, default=100)
parser.add_argument('--seed', type=int, default=100)

args = parser.parse_args()

set_seed(args.seed)

fit_msg_list = [args.model, 'bi' if args.bi else 'uni', str(args.batch)]
if args.model == 'lattice':
    fit_msg_list.append(str(args.skip_before_head))
fit_msg = ' '.join(fit_msg_list)
fitlog.commit(__file__, fit_msg=fit_msg)

device = torch.device(args.device)
for k, v in args.__dict__.items():
    print(k, v)

refresh_data = False

from pathes import *
# ontonote4ner_cn_path = 0
# yangjie_rich_pretrain_unigram_path = 0
# yangjie_rich_pretrain_bigram_path = 0
# resume_ner_path = 0
# weibo_ner_path = 0

if args.dataset == 'ontonote':
def main():
    args = parse_args()
    if args.debug:
        fitlog.debug()

    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initialized:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict['rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict['rel_lm_head.layer_norm.bias']

    model.resize_token_embeddings(len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE have been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter, model=model, metrics=metrics, device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester, log_loss_every=100, verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    bsz = args.batch_size // args.grad_accumulation

    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(train_data=train_data_iter,
                      dev_data=dev_data_iter,
                      model=model,
                      optimizer=optimizer,
                      loss=LossInForward(),
                      batch_size=bsz,
                      update_every=args.grad_accumulation,
                      n_epochs=args.epoch,
                      metrics=metrics,
                      callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
                      device=devices,
                      use_tqdm=True)

    trainer.train(load_best_model=False)
# imports needed by readdata() below
from sklearn.datasets import fetch_20newsgroups

from fastNLP import Const
from fastNLP import DataSet
from fastNLP import AccuracyMetric
from fastNLP import CrossEntropyLoss
from fastNLP import BucketSampler
from fastNLP import Batch
from fastNLP import Tester
from fastNLP import Callback
from fastNLP.core.callback import FitlogCallback

import torch
import time
import fitlog

fitlog.commit(__file__)             # auto commit your codes
fitlog.add_hyper_in_file(__file__)  # record your hyperparameters

loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

target_len = 20


def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')
    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)

    train_data = DataSet()