def test(model, args):
    """Evaluate `model` on the test split and return the group AUC.

    Fix: the parameter was previously named `arges` while the body read the
    name `args` (so the argument actually passed in was ignored in favor of a
    module global, or raised NameError without one).  Callers in this file
    invoke it positionally (`test(model, args)`), so renaming the parameter
    is backward-compatible.  The duplicate re-initialization of
    `preds`/`labels`/`imp_indexes` was also removed.

    NOTE(review): relies on module-level names `cudaid`, `NewsIterator`,
    `group_labels_func` and `cal_metric` defined elsewhere in this file.
    """
    metrics = ['group_auc']
    test_file = os.path.join(args.data_dir, args.test_data_file)
    # Accumulators over the whole test set.
    preds = []
    labels = []
    imp_indexes = []
    feature_file = os.path.join(args.data_dir, args.feature_file)
    # npratio=-1: no negative sampling at evaluation time — score every candidate.
    iterator = NewsIterator(batch_size=900, npratio=-1,
                            feature_file=feature_file, field=args.field)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_data_from_file(test_file)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            # Flatten scores / labels / impression ids to 1-D Python lists.
            logit = list(np.reshape(np.array(logit.cpu()), -1))
            label = list(np.reshape(np.array(label), -1))
            imp_index = list(np.reshape(np.array(imp_index), -1))
            labels.extend(label)
            preds.extend(logit)
            imp_indexes.extend(imp_index)
    print('all data: ', len(labels))
    # Group predictions by impression id, then compute the requested metrics.
    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    res = cal_metric(group_labels, group_preds, metrics)
    return res['group_auc']
def train(cudaid, args, model):
    """Distributed (one process per GPU) training loop: NCCL + apex AMP + DDP.

    `cudaid` is both the process rank and the local GPU index.  Validation
    and checkpointing happen only on rank 0.

    NOTE(review): this file defines `train` more than once; later definitions
    shadow earlier ones.  Depends on module-level `T_warm`, `all_iteration`,
    `lr`, `adjust_learning_rate`, `NewsIterator`, `dist`, `apex`, `amp`,
    `DDP` and `SummaryWriter`.
    """
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.size, rank=cudaid)
    # Fixed seeds for reproducibility across ranks.
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)
    # Gradient accumulation so the effective batch equals args.batch_size.
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    optimizer = apex.optimizers.FusedLAMB(
        model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-6,
        weight_decay=0.0, max_grad_norm=1.0)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=args.gpu_size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', args.field)
    # Only rank 0 writes tensorboard logs and checkpoints.
    if cudaid == 0:
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    batch_t = 0      # micro-batch counter
    iteration = 0    # optimizer-step counter
    step = 0         # validation counter (x-axis for auc/valid)
    best_score = -1
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file, cudaid, args.size)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # One positive + one negative candidate per sample.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Per-sample mean, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            # AMP loss scaling instead of a plain loss.backward().
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if batch_t % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch,
                          ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                          ' lr: ', optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                    writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # Validate every 2 optimizer steps (rank 0 only).
                if iteration % 2 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    if cudaid == 0:
                        auc = test(model, args)
                        print(auc)
                        writer.add_scalar('auc/valid', auc, step)
                        step += 1
                        if auc > best_score:
                            torch.save(model.state_dict(),
                                       os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                            best_score = auc
                            print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        if cudaid == 0:
            torch.save(model.state_dict(),
                       os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(model, optimizer, args):
    """Single-process training loop using nn.DataParallel across args.size GPUs.

    The model here returns a `(loss, sample_size)` pair (unlike the DDP
    variants in this file, which return only the loss).

    NOTE(review): this file defines `train` more than once; later definitions
    shadow earlier ones.  `cudaid` is read from module scope here — confirm it
    is defined before this runs.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # Per-GPU micro-batch is hard-coded to 8 in this variant.
    accumulation_steps = int(args.batch_size / args.size / 8)
    model = nn.DataParallel(model, device_ids=cuda_list)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file))
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    batch_t = 0      # micro-batch counter
    iteration = 0    # optimizer-step counter
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # One positive + one negative candidate per sample.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)
            sample_size = float(sample_size.sum())
            # Per-sample mean, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            loss.backward()
            if batch_t % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                accum_batch_loss = 0
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(model, optimizer, args):
    """Single-process DataParallel training loop with periodic validation.

    Runs validation every 2 optimizer steps and keeps the best checkpoint by
    group AUC, plus one checkpoint per epoch.

    NOTE(review): this file defines `train` more than once; later definitions
    shadow earlier ones.  `cudaid` is read from module scope here — confirm it
    is defined before this runs.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # Gradient accumulation so the effective batch equals args.batch_size.
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    model = torch.nn.DataParallel(model)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=args.gpu_size * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    batch_t = 0      # micro-batch counter
    iteration = 0    # optimizer-step counter
    step = 0         # validation counter (x-axis for auc/valid)
    best_score = -1
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # One positive + one negative candidate per sample.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Per-sample mean, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            loss.backward()
            if batch_t % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # Validate every 2 optimizer steps.
                if iteration % 2 == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    auc = test(model, args)
                    print(auc)
                    if auc > best_score:
                        torch.save(model.state_dict(),
                                   os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                        best_score = auc
                        print('best score: ', best_score)
                    writer.add_scalar('auc/valid', auc, step)
                    step += 1
                    torch.cuda.empty_cache()
                    model.train()
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(cudaid, args, model):
    """Distributed training loop (NCCL + apex AMP + DDP) with resume support.

    Resumes from `args.epoch` / `args.iteration` / `args.batch_t` /
    `args.best_score`; validation runs every 500 optimizer steps on rank 0.

    NOTE(review): this file defines `train` more than once; later definitions
    shadow earlier ones.  Depends on module-level `T_warm`, `all_iteration`,
    `lr`, `adjust_learning_rate`, `NewsIterator`, `dist`, `apex`, `amp`,
    `DDP`, `pynvml` and `SummaryWriter`.
    """
    pynvml.nvmlInit()
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.size, rank=cudaid)
    # Fixed seeds for reproducibility across ranks.
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)
    # Gradient accumulation so the effective batch equals args.batch_size.
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    optimizer = apex.optimizers.FusedLAMB(
        model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-6,
        weight_decay=0.0, max_grad_norm=1.0)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    accum_batch_loss = 0
    history_file = os.path.join(args.data_dir, args.history_file)
    # The abstract file is only used for 'last'-style fields.
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=args.gpu_size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            history_file=history_file, abs_file=abs_file,
                            field=args.field, fp16=True)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', args.field)
    # Only rank 0 writes tensorboard logs and checkpoints.
    if cudaid == 0:
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    # Resume counters from the checkpointed state passed in via args.
    iteration = args.iteration
    batch_t = args.batch_t
    step = int(iteration / 500) + 1
    best_score = args.best_score
    # args.batch_t*args.gpu_size — if this is nonzero, be sure to add a modulo (%).
    start_pos = None
    for epoch in range(args.epoch, 12):
        all_loss = 0
        all_batch = 0
        # Only the resumed epoch starts mid-file (from start_pos).
        if epoch != args.epoch:
            data_batch = iterator.load_data_from_file(train_file, cudaid, args.size)
        else:
            data_batch = iterator.load_data_from_file(train_file, cudaid, args.size, start_pos)
        print('load ok...')
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # One positive + one negative candidate per sample.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Per-sample mean, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            # AMP loss scaling instead of a plain loss.backward().
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if batch_t % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch,
                          ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                          ' lr: ', optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                    writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # Validate every 500 optimizer steps (rank 0 only).
                if iteration % 500 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    if cudaid == 0:
                        auc = test(model, args)
                        print(auc)
                        writer.add_scalar('auc/valid', auc, step)
                        step += 1
                        if auc > best_score:
                            torch.save(model.state_dict(),
                                       os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                            best_score = auc
                            print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        if cudaid == 0:
            torch.save(model.state_dict(),
                       os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(model, optimizer, args):
    """Single-GPU training loop (DataParallel wrapping disabled in this variant).

    The model here returns a `(loss, sample_size)` pair; the iterator is
    configured with `field=args.field`.

    NOTE(review): this file defines `train` more than once; later definitions
    shadow earlier ones.  `cudaid` is read from module scope here — confirm it
    is defined before this runs.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # Per-GPU micro-batch is hard-coded to 8 in this variant.
    accumulation_steps = int(args.batch_size / args.size / 8)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    batch_t = 0      # micro-batch counter
    iteration = 0    # optimizer-step counter
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # One positive + one negative candidate per sample.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)
            sample_size = float(sample_size.sum())
            # Per-sample mean, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            loss.backward()
            if batch_t % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                accum_batch_loss = 0
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))