import os
from time import strftime, localtime

import pandas as pd
import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AdamW, BertConfig, BertForMaskedLM

# Vocab, BuildDataSet, collate_fn and reduce_mean are project-local helpers
# assumed to be defined elsewhere in the repository.


def prep_data(trees, X_vocab=None, y_vocab=None):
    """Encode oracle (state, decision) pairs from parse trees, building the
    vocabularies on the fly when none are passed in."""
    update_vocab = False
    if X_vocab is None:
        X_vocab, y_vocab = Vocab(), Vocab()
        update_vocab = True
    X, y = [], []
    for tree in tqdm(trees):
        if len(tree.tokens) < 2:
            continue
        # TODO: accumulate features without iterating over all states
        try:
            for state, decision in tree.iter_oracle_states():
                feats = state.extract_features()
                if update_vocab:
                    X_vocab.add_words(feats)
                    y_vocab.add_word(decision)
                X.append([X_vocab.encode(f) for f in feats])
                y.append(y_vocab.encode(decision))
        except Exception:
            # Skip trees whose oracle transitions cannot be reconstructed.
            continue
    return X, y, X_vocab, y_vocab
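# prep_data only assumes a small Vocab interface: add_word/add_words to grow
# the index and encode to map an item to its id. The sketch below illustrates
# that assumed interface; it is a stand-in for the project's real Vocab class,
# not the actual implementation.
class Vocab:
    def __init__(self):
        self.word2id = {}
        self.id2word = []

    def add_word(self, word):
        # Assign the next free id to unseen words.
        if word not in self.word2id:
            self.word2id[word] = len(self.id2word)
            self.id2word.append(word)

    def add_words(self, words):
        for word in words:
            self.add_word(word)

    def encode(self, word, unk_id=0):
        # Fall back to a fixed unknown id for out-of-vocabulary items.
        return self.word2id.get(word, unk_id)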
def pre_trained(config):
    vocab = Vocab(config)
    vocab.add_words()
    vocab.build_bert_vocab()
    train = vocab.get_pre_trained_examples()
    print("train nums:{}".format(len(train)))

    # 3) Use a DistributedSampler so each process sees a distinct shard.
    train_dataset = BuildDataSet(train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_load = DataLoader(dataset=train_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn,
                            sampler=train_sampler)

    # Load the source BERT config and build a fresh masked-LM model; the
    # vocabulary size is taken from the project's vocab file.
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json"
        .format(config.model_name))
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForMaskedLM(config=model_config)
    # (Optional) resume from '../user_data/save_bert/{model_name}_checkpoint.pth.tar',
    # or initialize from the source BERT weights after dropping the embedding
    # and MLM-head tensors whose shapes depend on the new vocabulary size.

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to its GPU before wrapping it for DDP.
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()
        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss

            # Synchronize the processes, then average the loss across them.
            torch.distributed.barrier()
            if torch.cuda.device_count() > 1:
                reduced_loss = reduce_mean(loss, config.nprocs)
            else:
                reduced_loss = loss

            model.zero_grad()
            loss.backward()
            optimizer.step()

            if config.local_rank in [0, -1]:
                now = strftime("%Y-%m-%d %H:%M:%S", localtime())
                print("time:{},epoch:{}/{},mlm_reduce_loss:{}".format(
                    now, epoch + 1, config.num_train_epochs,
                    reduced_loss.item()))

        # Save a checkpoint once per epoch from the main process only.
        if config.local_rank in [0, -1]:
            if torch.cuda.device_count() > 1:
                checkpoint = {
                    "status": model.module.state_dict(),
                    "epoch": epoch + 1,
                }
            else:
                checkpoint = {
                    "status": model.state_dict(),
                    "epoch": epoch + 1,
                }
            torch.save(
                checkpoint,
                '../user_data/save_bert/{}_checkpoint.pth.tar'.format(
                    config.model_name))
            del checkpoint
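# The distributed loss above is averaged with a project-local reduce_mean
# helper that is not shown in this file. A minimal sketch of the usual
# implementation, assuming the default process group is already initialized:
# all-reduce a copy of the tensor with SUM, then divide by the process count.
def reduce_mean(tensor, nprocs):
    rt = tensor.clone()  # keep the caller's tensor intact
    torch.distributed.all_reduce(rt, op=torch.distributed.ReduceOp.SUM)
    rt /= nprocs  # SUM over nprocs -> mean
    return rt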
def pre_trained(config):
    # Variant of pre_trained that initializes the masked-LM model from
    # pretrained BERT weights on disk instead of training from scratch.
    vocab = Vocab(config)
    vocab.add_words()
    vocab.build_bert_vocab()
    train = vocab.get_pre_trained_examples()

    # 3) Use a DistributedSampler so each process sees a distinct shard.
    train_dataset = BuildDataSet(train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_load = DataLoader(dataset=train_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn,
                            sampler=train_sampler)

    # Load the source BERT config and weights.
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="bert_source/bert_config.json")
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path="bert_source", config=model_config)
    # Alternatively, start from a randomly initialized model:
    # model_config = BertConfig()
    # model = BertForMaskedLM(config=model_config)

    # 4) Move the model to its GPU before wrapping it for DDP.
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss

            # Synchronize the processes, then average the loss across them.
            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, config.nprocs)

            model.zero_grad()
            loss.backward()
            optimizer.step()

            if config.local_rank in [0, -1]:
                now = strftime("%Y-%m-%d %H:%M:%S", localtime())
                print("time:{},epoch:{}/{},mlm_reduce_loss:{},loss:{}".format(
                    now, epoch + 1, config.num_train_epochs,
                    reduced_loss.item(), loss.item()))

    # Save the final weights from the main process; unwrap the DDP module
    # when present so the checkpoint also loads on a single GPU.
    if config.local_rank in [0, -1]:
        state = (model.module.state_dict()
                 if hasattr(model, "module") else model.state_dict())
        torch.save(state, 'save_bert' + os.sep + 'checkpoint.pth.tar')
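# A minimal sketch of the distributed entry point these functions assume:
# each process is started by the PyTorch launcher (e.g.
# `python -m torch.distributed.launch --nproc_per_node=N pretrain.py`), the
# NCCL process group is initialized before pre_trained() runs, and `Config`
# is a hypothetical stand-in for the project's config object (local_rank,
# device, nprocs, batch size, etc. are set on it).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1)
    args = parser.parse_args()

    torch.distributed.init_process_group(backend="nccl")
    torch.cuda.set_device(args.local_rank)

    config = Config()  # hypothetical: the project's config object
    config.local_rank = args.local_rank
    config.device = torch.device("cuda", args.local_rank)
    config.nprocs = torch.cuda.device_count()
    pre_trained(config)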