def test(model, args):
    """Evaluate `model` on the full test set and return the group AUC.

    Streams the test file one impression per batch (batch_size=1), scores
    every candidate in 'validation' mode, then groups scores by impression
    id and computes 'group_auc' via `cal_metric`.

    Args:
        model: scoring model, called as
            model(his_id, candidate_id, None, mode='validation') and
            expected to return one logit per candidate.
        args: namespace providing data_dir, test_data_file, feature_file,
            history_file, abs_file and field.

    Returns:
        float: group AUC over all impressions.
    """
    # NOTE(review): the accumulators were previously initialized twice;
    # the duplicate assignments were removed.
    preds = []
    labels = []
    imp_indexes = []
    metrics = ['group_auc']
    test_file = os.path.join(args.data_dir, args.test_data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    history_file = os.path.join(args.data_dir, args.history_file)
    # The abstract file is only used for 'last'-style fields.
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=1, npratio=-1, feature_file=feature_file,
                            history_file=history_file, abs_file=abs_file,
                            field=args.field, fp16=True)
    print('test...')
    cudaid = 0  # evaluation always runs on GPU 0
    step = 0
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, None)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, _ in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            logit = list(np.reshape(np.array(logit.cpu()), -1))
            label = list(np.reshape(np.array(label), -1))
            imp_index = list(np.reshape(np.array(imp_index), -1))
            # batch_size=1 => exactly one impression id per batch; repeat it
            # so each candidate score carries its impression id.
            assert len(imp_index) == 1
            imp_index = imp_index * len(logit)
            assert len(logit) == len(label)
            assert len(logit) == len(imp_index)
            # Every impression is expected to contain at least one positive.
            assert np.sum(np.array(label)) != 0
            labels.extend(label)
            preds.extend(logit)
            imp_indexes.extend(imp_index)
            step += 1
            if step % 100 == 0:
                print('all data: ', len(labels))
    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    res = cal_metric(group_labels, group_preds, metrics)
    return res['group_auc']
def test(model, args):
    """Evaluate `model` on the test set in large batches and return group AUC.

    Simpler variant of the impression-at-a-time evaluator: loads the test
    file with `load_data_from_file` at batch_size=900 and aggregates all
    scores before grouping by impression id.

    Args:
        model: scoring model, called as
            model(his_id, candidate_id, None, mode='validation').
        args: namespace providing data_dir, test_data_file, feature_file
            and field.

    Returns:
        float: group AUC over all impressions.
    """
    # NOTE(review): the parameter was misspelled 'arges' while the body used
    # 'args', and 'cudaid' was never defined — both raised NameError at call
    # time. Fixed by naming the parameter 'args' and pinning GPU 0.
    cudaid = 0
    preds = []
    labels = []
    imp_indexes = []
    metrics = ['group_auc']
    test_file = os.path.join(args.data_dir, args.test_data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=900, npratio=-1,
                            feature_file=feature_file, field=args.field)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_data_from_file(test_file)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            logit = list(np.reshape(np.array(logit.cpu()), -1))
            label = list(np.reshape(np.array(label), -1))
            imp_index = list(np.reshape(np.array(imp_index), -1))
            labels.extend(label)
            preds.extend(logit)
            imp_indexes.extend(imp_index)
    print('all data: ', len(labels))
    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    res = cal_metric(group_labels, group_preds, metrics)
    return res['group_auc']
def test(cudaid, args, model):
    """Distributed inference worker: score the test set and dump logits.

    Joins the NCCL process group, streams `args.data_file` through
    `model.predict`, and writes one 'imp_index: ... logit: ...' line per
    candidate to a rank-specific log file (`args.log_file + str(cudaid)`).
    No metric is computed here — the per-rank files are meant to be merged
    and evaluated offline.

    Args:
        cudaid: this process's rank, also used as its GPU index.
        args: namespace providing data_dir, data_file, feature_file,
            history_file, abs_file, log_file, field, gpu_size, can_length
            and size (world size).
        model: model exposing predict(his_id, candidate_id) returning one
            logit per (impression, candidate) slot.
    """
    # NOTE(review): removed unused preds/labels/imp_indexes accumulators, a
    # duplicate feature_file assignment, and large commented-out code blocks.
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.size, rank=cudaid)
    model.eval()
    test_file = os.path.join(args.data_dir, args.data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    history_file = os.path.join(args.data_dir, args.history_file)
    # The abstract file is only used for 'last'-style fields.
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    w = open(os.path.join(args.data_dir, args.log_file + str(cudaid)), 'w')
    iterator = NewsIterator(batch_size=args.gpu_size, npratio=-1,
                            feature_file=feature_file, history_file=history_file,
                            abs_file=abs_file, field=args.field, fp16=True)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, args.can_length)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, can_len in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model.predict(his_id, candidate_id)
            logit = np.array(logit.cpu())
            imp_index = np.reshape(np.array(imp_index), -1)
            assert len(imp_index) == len(logit)
            # can_len[i][0] is the true candidate count for impression i
            # (rows appear padded to a common width) — TODO confirm against
            # NewsIterator.
            for i in range(len(imp_index)):
                for j in range(can_len[i][0]):
                    w.write('imp_index: ' + str(imp_index[i]) + ' logit: ' + str(logit[i][j]) + '\n')
                    print('imp_index: ' + str(imp_index[i]) + ' logit: ' + str(logit[i][j]))
            print('imp_index: ', imp_index[-1])
    w.close()
def train(cudaid, args, model):
    """Distributed training loop for one DDP worker (apex AMP, FusedLAMB).

    Initializes NCCL, wraps `model` in apex AMP (O2) and DDP, then trains
    for 10 epochs with gradient accumulation. Rank 0 additionally logs to
    TensorBoard, runs validation, and checkpoints the best/last model.

    Args:
        cudaid: this process's rank, also used as its GPU index.
        args: namespace providing size (world size), batch_size, gpu_size,
            data_dir, data_file, feature_file, field, log_file and save_dir.
        model: trainable model; forward(his_id, candidate_id, label)
            returns a per-sample loss tensor.
    """
    dist.init_process_group(
        backend='nccl', init_method='env://', world_size=args.size, rank=cudaid)
    # Fix all RNG seeds so every rank starts from the same state.
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)
    # Micro-batches folded into one optimizer step:
    # global batch = size * gpu_size * accumulation_steps.
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    optimizer = apex.optimizers.FusedLAMB(
        model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-6,
        weight_decay=0.0, max_grad_norm=1.0)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    accum_batch_loss = 0
    iterator = NewsIterator(
        batch_size=args.gpu_size, npratio=4,
        feature_file=os.path.join(args.data_dir, args.feature_file),
        field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', args.field)
    if cudaid == 0:
        # Only rank 0 writes TensorBoard logs.
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    epoch = 0
    model.train()
    # NOTE(review): these counters are re-initialized, duplicating the
    # assignments above (left in place to preserve the original code).
    batch_t = 0
    iteration = 0
    step = 0
    best_score = -1
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        # Each rank reads its own shard of the training file.
        data_batch = iterator.load_data_from_file(train_file, cudaid, args.size)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # Pairwise training layout: each row holds 2 candidates —
            # presumably one positive and one sampled negative; TODO confirm
            # against NewsIterator (npratio=4 at sampling time).
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Mean per-sample loss, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            # Scale so the accumulated gradient matches a full-batch step.
            loss = loss / accumulation_steps
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch,
                          ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                          ' lr: ', optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                    writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # NOTE(review): validating every 2 iterations looks like a
                # leftover debug frequency — confirm before long runs. Also,
                # only rank 0 toggles eval/train here while other ranks keep
                # training; verify this is intended with DDP.
                if iteration % 2 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    if cudaid == 0:  # redundant — already guarded above
                        auc = test(model, args)
                        print(auc)
                        writer.add_scalar('auc/valid', auc, step)
                        step += 1
                        if auc > best_score:
                            torch.save(model.state_dict(),
                                       os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                            best_score = auc
                            print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        if cudaid == 0:
            # Per-epoch checkpoint (in addition to the rolling best model).
            torch.save(model.state_dict(),
                       os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(model, optimizer, args):
    """Single-process multi-GPU (nn.DataParallel) training loop.

    Wraps `model` in DataParallel over `args.size` GPUs and trains for 10
    epochs with gradient accumulation, logging the accumulated loss to
    TensorBoard and checkpointing after every epoch.

    Args:
        model: trainable model; forward(his_id, candidate_id, label) returns
            (loss tensor, sample_size tensor).
        optimizer: optimizer whose lr is driven by `adjust_learning_rate`.
        args: namespace providing size (GPU count), batch_size, data_dir,
            data_file, feature_file, log_file and save_dir.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # NOTE(review): 'cudaid' was previously undefined here, raising NameError
    # at the first .cuda(cudaid) call. With DataParallel, inputs are placed
    # on device 0 and scattered from there.
    cudaid = 0
    # Per-GPU micro-batch is 8; accumulate up to the configured global batch.
    accumulation_steps = int(args.batch_size / args.size / 8)
    model = nn.DataParallel(model, device_ids=cuda_list)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file))
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # Pairwise layout: 2 candidates per row — TODO confirm against
            # NewsIterator (npratio=4 at sampling time).
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)
            # DataParallel returns one partial sum per GPU; combine them.
            sample_size = float(sample_size.sum())
            # Mean per-sample loss, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            # Scale so the accumulated gradient matches a full-batch step.
            loss = loss / accumulation_steps
            loss.backward()
            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                accum_batch_loss = 0
        # Per-epoch checkpoint.
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(model, optimizer, args):
    """DataParallel training loop with periodic validation and best-model saving.

    Trains for 10 epochs with gradient accumulation; every 2 optimizer
    iterations it runs `test(model, args)` and checkpoints whenever the
    group AUC improves, plus a per-epoch checkpoint.

    Args:
        model: trainable model; forward(his_id, candidate_id, label) returns
            a per-sample loss tensor.
        optimizer: optimizer whose lr is driven by `adjust_learning_rate`.
        args: namespace providing size, batch_size, gpu_size, data_dir,
            data_file, feature_file, field, log_file and save_dir.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # NOTE(review): 'cudaid' was previously undefined here, raising NameError
    # at the first .cuda(cudaid) call. DataParallel scatters from device 0.
    cudaid = 0
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    model = torch.nn.DataParallel(model)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=args.gpu_size * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    step = 0
    best_score = -1
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # Pairwise layout: 2 candidates per row — TODO confirm against
            # NewsIterator (npratio=4 at sampling time).
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Mean per-sample loss, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            loss.backward()
            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # NOTE(review): validating every 2 iterations looks like a
                # leftover debug frequency — confirm before long runs.
                if iteration % 2 == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    auc = test(model, args)
                    print(auc)
                    if auc > best_score:
                        torch.save(
                            model.state_dict(),
                            os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                        best_score = auc
                        print('best score: ', best_score)
                    writer.add_scalar('auc/valid', auc, step)
                    step += 1
                    torch.cuda.empty_cache()
                    model.train()
        # Per-epoch checkpoint (in addition to the rolling best model).
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def train(cudaid, args, model):
    """Distributed DDP training loop (apex AMP + FusedLAMB) with resume support.

    Like the other DDP trainer, but counters (iteration/batch_t/best_score)
    and the starting epoch are restored from `args`, so an interrupted run
    can continue. Rank 0 logs to TensorBoard, validates every 500 iterations,
    and checkpoints best/last models.

    Args:
        cudaid: this process's rank, also used as its GPU index.
        args: namespace providing size, batch_size, gpu_size, data_dir,
            data_file, feature_file, history_file, abs_file, field, log_file,
            save_dir, plus resume state: epoch, iteration, batch_t, best_score.
        model: trainable model; forward(his_id, candidate_id, label) returns
            a per-sample loss tensor.
    """
    pynvml.nvmlInit()  # NVML kept available for GPU-memory debugging
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.size, rank=cudaid)
    # Fix all RNG seeds so every rank starts from the same state.
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)
    # Micro-batches folded into one optimizer step.
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    optimizer = apex.optimizers.FusedLAMB(model.parameters(), lr=lr,
                                          betas=(0.9, 0.98), eps=1e-6,
                                          weight_decay=0.0, max_grad_norm=1.0)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    accum_batch_loss = 0
    history_file = os.path.join(args.data_dir, args.history_file)
    # The abstract file is only used for 'last'-style fields.
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=args.gpu_size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            history_file=history_file, abs_file=abs_file,
                            field=args.field, fp16=True)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', args.field)
    if cudaid == 0:
        # Only rank 0 writes TensorBoard logs.
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    # Restore counters from the resume state carried in args.
    #epoch=args.epoch
    iteration = args.iteration
    batch_t = args.batch_t
    # Validation runs every 500 iterations, so derive the tensorboard step.
    step = int(iteration / 500) + 1
    best_score = args.best_score
    # args.batch_t*args.gpu_size — if this is non-zero, remember to apply a
    # modulo (translated from the original Chinese note).
    start_pos = None
    for epoch in range(args.epoch, 12):
        all_loss = 0
        all_batch = 0
        if epoch != args.epoch:
            data_batch = iterator.load_data_from_file(train_file, cudaid, args.size)
        else:
            # First resumed epoch may restart from a saved file offset.
            data_batch = iterator.load_data_from_file(train_file, cudaid, args.size, start_pos)
        print('load ok...')
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # Pairwise layout: 2 candidates per row — presumably one positive
            # and one sampled negative; TODO confirm against NewsIterator.
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)
            sample_size = candidate_id.shape[0]
            # Mean per-sample loss, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            # Scale so the accumulated gradient matches a full-batch step.
            loss = loss / accumulation_steps
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch,
                          ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                          ' lr: ', optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                    writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration)
                accum_batch_loss = 0
                # NOTE(review): only rank 0 toggles eval/train here while the
                # other ranks keep training; verify this is intended with DDP.
                if iteration % 500 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    if cudaid == 0:  # redundant — already guarded above
                        auc = test(model, args)
                        print(auc)
                        writer.add_scalar('auc/valid', auc, step)
                        step += 1
                        if auc > best_score:
                            torch.save(
                                model.state_dict(),
                                os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl'))
                            best_score = auc
                            print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        if cudaid == 0:
            # Per-epoch checkpoint (in addition to the rolling best model).
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def test(model, args):
    """Score the validation set and dump per-candidate logits and labels.

    Streams `args.data_file` through `model.predict` and writes one
    'imp_index: ... logit: ... label: ...' line per candidate to
    `args.log_file`. No metric is computed — the dump is evaluated offline.

    Args:
        model: model exposing predict(his_id, candidate_id) returning one
            logit per (impression, candidate) slot.
        args: namespace providing data_dir, data_file, feature_file,
            log_file, field, gpu_size, can_length and cudaid.
    """
    # NOTE(review): removed unused preds/labels/imp_indexes accumulators and
    # large commented-out code blocks.
    model.eval()
    test_file = os.path.join(args.data_dir, args.data_file)
    cudaid = args.cudaid
    w = open(os.path.join(args.data_dir, args.log_file), 'w')
    feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=args.gpu_size, npratio=-1,
                            feature_file=feature_file, field=args.field)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, args.can_length)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, can_len in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model.predict(his_id, candidate_id)
            logit = np.array(logit.cpu())
            imp_index = np.reshape(np.array(imp_index), -1)
            assert len(imp_index) == len(logit)
            # can_len[i][0] is the true candidate count for impression i
            # (rows appear padded to a common width).
            for i in range(len(imp_index)):
                for j in range(can_len[i][0]):
                    # Labels must line up with the unpadded candidate list.
                    assert len(label[i]) == can_len[i][0]
                    w.write('imp_index: ' + str(imp_index[i]) + ' logit: ' + str(logit[i][j]) + ' label: ' + str(label[i][j]) + '\n')
            print('imp_index: ', imp_index[-1])
    w.close()
def train(model, optimizer, args):
    """Single-GPU training loop (DataParallel wrapping disabled).

    Trains for 10 epochs with gradient accumulation, logging the accumulated
    loss to TensorBoard and checkpointing after every epoch. The model's
    forward returns (loss, sample_size) like the DataParallel variant, so the
    same reduction is applied.

    Args:
        model: trainable model; forward(his_id, candidate_id, label) returns
            (loss tensor, sample_size tensor).
        optimizer: optimizer whose lr is driven by `adjust_learning_rate`.
        args: namespace providing size, batch_size, data_dir, data_file,
            feature_file, field, log_file and save_dir.
    """
    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr)
    cuda_list = range(args.size)
    # NOTE(review): 'cudaid' was previously undefined here, raising NameError
    # at the first .cuda(cudaid) call; pin everything to GPU 0.
    cudaid = 0
    # Per-GPU micro-batch is 8; accumulate up to the configured global batch.
    accumulation_steps = int(args.batch_size / args.size / 8)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size, npratio=4,
                            feature_file=os.path.join(args.data_dir, args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            # Pairwise layout: 2 candidates per row — TODO confirm against
            # NewsIterator (npratio=4 at sampling time).
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)
            sample_size = float(sample_size.sum())
            # Mean per-sample loss, converted from nats to bits.
            loss = loss.sum() / sample_size / math.log(2)
            accum_batch_loss += float(loss)
            all_loss += float(loss)
            all_batch += 1
            loss = loss / accumulation_steps
            loss.backward()
            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch,
                      ' accum_batch_loss: ', accum_batch_loss / accumulation_steps,
                      ' lr: ', optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration)
                accum_batch_loss = 0
        # Per-epoch checkpoint.
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def test(model, args, cudaid):
    """Score this rank's shard of the test set and return raw result arrays.

    Sharded counterpart of the single-process evaluator: the iterator yields
    only the impressions belonging to rank `cudaid` out of `args.size`, and
    the caller is responsible for merging shards and computing metrics.

    Args:
        model: scoring model, called as
            model(his_id, candidate_id, None, mode='validation').
        args: namespace providing data_dir, test_data_file, feature_file,
            test_feature_file (optional override), field and size.
        cudaid: this worker's rank / GPU index.

    Returns:
        tuple (labels, preds, imp_indexes): three aligned 1-D numpy arrays
        (empty lists if the shard yielded no data).
    """
    # NOTE(review): the accumulators were assigned np.array([]) and then
    # immediately shadowed by plain lists; the dead first assignments and the
    # unused 'metrics' variable were removed. np.concatenate accepts the
    # initial empty list, so behavior is unchanged.
    preds = []
    labels = []
    imp_indexes = []
    test_file = os.path.join(args.data_dir, args.test_data_file)
    # Prefer a test-specific feature file when one is configured.
    if args.test_feature_file is not None:
        feature_file = os.path.join(args.data_dir, args.test_feature_file)
    else:
        feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=1, npratio=-1, feature_file=feature_file,
                            field=args.field, fp16=True)
    print('test...')
    step = 0
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, None,
                                                       rank=cudaid, size=args.size)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, _ in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            logit = np.reshape(np.array(logit.data.cpu()), -1)
            label = np.reshape(np.array(label), -1)
            # batch_size=1 => exactly one impression id per batch; repeat it
            # so each candidate score carries its impression id.
            assert len(imp_index) == 1
            imp_index = np.repeat(imp_index, len(logit))
            assert len(logit) == len(label), (len(logit), len(label))
            assert len(logit) == len(imp_index)
            # Every impression is expected to contain at least one positive.
            assert np.sum(label) != 0
            # NOTE(review): per-batch np.concatenate is O(n^2) over the run;
            # consider collecting chunks and concatenating once if this shows
            # up in profiles.
            labels = np.concatenate((labels, label), axis=0)
            preds = np.concatenate((preds, logit), axis=0)
            imp_indexes = np.concatenate((imp_indexes, imp_index), axis=0)
            step += 1
            if step % 100 == 0:
                print('all data: ', len(labels), cudaid)
    return labels, preds, imp_indexes