def main(args):
    """Run inference on the test split and write one prediction CSV per class.

    The function loads the checkpoint ``models/model-epoch-04.ckpt`` (relative
    to the current working directory) into a ``TransformerModel``, iterates
    over ``data_loaders['test']`` and, for every video, keeps the
    ``args.vid_pred_length`` most probable video-level labels.  For each kept
    label it selects the attention weights indexed by that label and keeps the
    ``args.seg_pred_length`` highest-weighted segment positions, min-max
    normalising their weights.  Results are grouped by predicted label
    (1-based) and written to ``outputs/<label:04d>.csv``.

    ``device``, ``get_dataloader``, ``TransformerModel`` and
    ``batched_index_select`` are expected to be available at module scope.

    Args:
        args: Parsed options.  The attributes read here are ``input_dir``,
            ``max_frame_length``, ``max_vid_label_length``,
            ``max_seg_label_length``, ``rgb_feature_size``,
            ``audio_feature_size``, ``batch_size``, ``num_workers``,
            ``n_layers``, ``n_heads``, ``d_rgb``, ``d_audio``, ``d_model``,
            ``d_ff``, ``d_proj``, ``n_attns``, ``num_classes``, ``dropout``,
            ``vid_pred_length`` and ``seg_pred_length``.

    Returns:
        None.  Output is the set of CSV files plus progress lines on stdout.
    """
    since = time.time()

    output_dir = os.path.join(os.getcwd(), 'outputs')
    os.makedirs(output_dir, exist_ok=True)

    data_loaders = get_dataloader(
        input_dir=args.input_dir,
        which_challenge='3rd_challenge',
        phases=['test'],
        max_frame_length=args.max_frame_length,
        max_vid_label_length=args.max_vid_label_length,
        max_seg_label_length=args.max_seg_label_length,
        rgb_feature_size=args.rgb_feature_size,
        audio_feature_size=args.audio_feature_size,
        batch_size=args.batch_size,
        num_workers=args.num_workers)

    model = TransformerModel(
        n_layers=args.n_layers,
        n_heads=args.n_heads,
        rgb_feature_size=args.rgb_feature_size,
        audio_feature_size=args.audio_feature_size,
        d_rgb=args.d_rgb,
        d_audio=args.d_audio,
        d_model=args.d_model,
        d_ff=args.d_ff,
        d_proj=args.d_proj,
        n_attns=args.n_attns,
        num_classes=args.num_classes,
        dropout=args.dropout)
    model = model.to(device)

    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device this process actually runs on (including CPU-only hosts).
    checkpoint = torch.load(
        os.path.join(os.getcwd(), 'models/model-epoch-04.ckpt'),
        map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    columns = ['vid_id', 'vid_label_pred', 'vid_prob', 'seg_label_pred', 'seg_prob']
    # Rows are collected in plain lists and turned into DataFrames once at the
    # end: DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    rows_per_class = {label: [] for label in range(1, args.num_classes + 1)}

    # Inference only: do not record autograd graphs.
    with torch.no_grad():
        for idx, (vid_ids, frame_lengths, frame_rgbs, frame_audios,
                  vid_labels, seg_labels, seg_times) in enumerate(data_loaders['test']):

            if idx % 10 == 0:
                print('idx:', idx)

            # frame_rgbs: [batch_size, frame_length, rgb_feature_size]
            # frame_audios: [batch_size, frame_length, audio_feature_size]
            frame_rgbs = frame_rgbs.to(device)
            frame_audios = frame_audios.to(device)
            batch_size = frame_audios.size(0)

            # vid_probs: [batch_size, num_classes]
            # attn_idc: [batch_size, num_classes]
            # scores: [batch_size, max_seg_length, n_attns]
            # attn_weights: [batch_size, max_seg_length, n_attns]
            vid_probs, attn_idc, scores, attn_weights, conv_loss = model(
                frame_rgbs, frame_audios, device)

            # vid_probs: [batch_size, vid_pred_length]
            # vid_label_preds: [batch_size, vid_pred_length]
            vid_probs, vid_label_preds = torch.topk(vid_probs, args.vid_pred_length)
            # Shift to 1-based labels, matching the 1..num_classes CSV names.
            vid_label_preds = vid_label_preds + 1

            # attn_idc: [batch_size, num_classes+1]
            # A zero column is prepended so the 1-based labels above can be
            # used directly as gather indices.
            zeros = torch.zeros(batch_size, 1).long().to(device)
            attn_idc = torch.cat((zeros, attn_idc), dim=1)

            # selected_attn_idc: [batch_size, vid_pred_length]
            selected_attn_idc = torch.gather(attn_idc, 1, vid_label_preds)

            # attn_weights: [batch_size, n_attns, max_seg_length]
            attn_weights = attn_weights.transpose(1, 2)

            # selected_attn_weights: [batch_size, vid_pred_length, max_seg_length]
            selected_attn_weights = batched_index_select(attn_weights, 1, selected_attn_idc)

            # seg_probs: [batch_size, vid_pred_length, seg_pred_length]
            # seg_label_preds: [batch_size, vid_pred_length, seg_pred_length]
            seg_probs, seg_label_preds = torch.topk(selected_attn_weights, args.seg_pred_length)
            seg_label_preds = seg_label_preds + 1

            # seg_prob_min, seg_prob_max: [batch_size, vid_pred_length, 1]
            # keepdim=True lets broadcasting stand in for unsqueeze + expand.
            seg_prob_min, _ = seg_probs.min(dim=2, keepdim=True)
            seg_prob_max, _ = seg_probs.max(dim=2, keepdim=True)

            # Min-max normalise per (video, label); the epsilon avoids a
            # division by zero when all selected weights are equal.
            seg_probs = (seg_probs - seg_prob_min) / (seg_prob_max - seg_prob_min + 1e-6)

            # To save predictions, converted to numpy data.
            vid_probs = vid_probs.cpu().detach().numpy()
            vid_label_preds = vid_label_preds.cpu().numpy()
            seg_probs = seg_probs.cpu().detach().numpy()
            seg_label_preds = seg_label_preds.cpu().numpy()

            for i in range(batch_size):
                for j in range(args.vid_pred_length):
                    vid_label_pred = vid_label_preds[i][j]
                    rows_per_class[vid_label_pred].append(
                        {'vid_id': vid_ids[i],
                         'vid_label_pred': vid_label_pred,
                         'vid_prob': vid_probs[i][j],
                         'seg_label_pred': list(seg_label_preds[i][j]),
                         'seg_prob': list(seg_probs[i][j])})

    # A CSV (header only, if there were no predictions) is written for every class.
    for label in range(1, args.num_classes + 1):
        df_output = pd.DataFrame(rows_per_class[label], columns=columns)
        df_output.to_csv(os.path.join(output_dir, '%04d.csv' % label), index=False)

    time_elapsed = time.time() - since
    print('=> Running time in a epoch: {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60))
inp, target = get_data_tensor(data, country, measure_mode, output_mode=output_mode, cuda=cuda) out_nn, _ = get_net_output(inp, model_type, model, cuda) temp_loss = criterion(out_nn, target) loss += temp_loss if (it + 1) % batch_size == 0: loss.backward() optimizer.step() train_loss_seq.append(loss.item()/batch_size) # Test model.eval() test_loss = 0.0 for c in test_countries: inp, target = get_data_tensor(data, c, measure_mode, output_mode=output_mode, cuda=cuda) out_nn, _ = get_net_output(inp, model_type, model, cuda) test_loss += criterion(out_nn, target) test_loss_seq.append(test_loss.item()/len(test_countries)) if test_loss_seq[-1] < min_test_loss: min_test_loss = test_loss_seq[-1] best_state_dict = copy.deepcopy(model.state_dict()) if nnet == 0: nets_min_test_loss = -1.
class Trainer(TrainerBase):
    """Build, train and evaluate the text classifier selected by a JSON config.

    All relative paths (the config file, the checkpoint directory and the
    summary directories) are resolved against the parent directory of the
    current working directory.

    Config keys read by this class: ``ckpt_model_path``, ``model_name``,
    ``output_path``, ``epochs``, ``batch_size``, ``keep_prob``,
    ``learning_rate``, ``num_classes`` and ``print_every``.
    """

    # Training stops once the learning rate has decayed to this value.
    _MIN_LEARNING_RATE = 0.00001

    def __init__(self, args):
        """Load the config, prepare the data sets and create the model.

        Args:
            args: Object exposing ``config_path``, the config file location
                relative to the parent of the current working directory.
        """
        super(Trainer, self).__init__()
        self.args = args
        with open(self._project_path(args.config_path), "r") as fr:
            self.config = json.load(fr)

        self.train_data_obj = None
        self.eval_data_obj = None
        self.model = None

        # Directory in which checkpoints are stored.
        self.save_path = self._project_path(self.config["ckpt_model_path"])
        os.makedirs(self.save_path, exist_ok=True)

        # Load the data sets.
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = \
            self.train_data_obj.gen_data()
        print("train data size: {}".format(len(self.train_labels)))
        self.vocab_size = self.train_data_obj.vocab_size
        print("vocab size: {}".format(self.vocab_size))
        self.word_vectors = self.train_data_obj.word_vectors
        self.label_list = list(label_to_idx.values())

        self.eval_inputs, self.eval_labels = self.eval_data_obj.gen_data()
        print("eval data size: {}".format(len(self.eval_labels)))
        print("label numbers: ", len(self.label_list))

        # Initialise the model object.
        self.create_model()

    @staticmethod
    def _project_path(relative_path):
        """Join *relative_path* onto the parent of the current working directory."""
        return os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), relative_path)

    def load_data(self):
        """Create the training and evaluation data objects from the config."""
        # Training-set object (its data is generated later via gen_data()).
        self.train_data_obj = TrainData(self.config)
        # Evaluation-set object.
        self.eval_data_obj = EvalData(self.config)

    def create_model(self):
        """Instantiate the model named by ``config["model_name"]``.

        Raises:
            ValueError: If the configured name is not one of ``textcnn``,
                ``bilstm``, ``bilstm_atten``, ``rcnn`` or ``transformer``.
                (Previously ``self.model`` silently stayed ``None`` and the
                failure only surfaced later inside ``train``.)
        """
        model_name = self.config["model_name"]
        if model_name == "textcnn":
            model_cls = TextCnnModel
        elif model_name == "bilstm":
            model_cls = BiLstmModel
        elif model_name == "bilstm_atten":
            model_cls = BiLstmAttenModel
        elif model_name == "rcnn":
            model_cls = RcnnModel
        elif model_name == "transformer":
            model_cls = TransformerModel
        else:
            raise ValueError(
                "unsupported model_name: {!r}".format(model_name))

        self.model = model_cls(config=self.config,
                               vocab_size=self.vocab_size,
                               word_vectors=self.word_vectors)

    def _make_summary_writer(self, split, graph):
        """Create ``<output_path>/summary/<split>`` and return a FileWriter for it."""
        summary_path = self._project_path(
            self.config["output_path"] + "/summary/" + split)
        os.makedirs(summary_path, exist_ok=True)
        return tf.summary.FileWriter(summary_path, graph)

    def _print_train_metrics(self, current_step, loss, predictions, batch):
        """Print metrics of one training batch (binary or multi-class)."""
        if self.config["num_classes"] == 1:
            acc, auc, recall, prec, f_beta = get_binary_metrics(
                pred_y=predictions, true_y=batch["y"])
            print(
                "train: step: {}, loss: {}, acc: {}, auc: {}, recall: {}, precision: {}, f_beta: {}"
                .format(current_step, loss, acc, auc, recall, prec, f_beta))
        elif self.config["num_classes"] > 1:
            acc, recall, prec, f_beta = get_multi_metrics(
                pred_y=predictions, true_y=batch["y"], labels=self.label_list)
            print(
                "train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}"
                .format(current_step, loss, acc, recall, prec, f_beta))

    def _evaluate(self, sess, summary_writer, current_step):
        """Run the model over the evaluation set, print averaged metrics.

        Returns:
            The mean evaluation loss over all evaluation batches, as computed
            by the module-level ``mean`` helper.
        """
        eval_losses = []
        eval_accs = []
        eval_aucs = []
        eval_recalls = []
        eval_precs = []
        eval_f_betas = []
        for eval_batch in self.eval_data_obj.next_batch(
                self.eval_inputs, self.eval_labels, self.config["batch_size"]):
            eval_summary, eval_loss, eval_predictions = self.model.eval(
                sess, eval_batch)
            summary_writer.add_summary(eval_summary, current_step)
            eval_losses.append(eval_loss)

            if self.config["num_classes"] == 1:
                acc, auc, recall, prec, f_beta = get_binary_metrics(
                    pred_y=eval_predictions, true_y=eval_batch["y"])
                eval_accs.append(acc)
                eval_aucs.append(auc)
                eval_recalls.append(recall)
                eval_precs.append(prec)
                eval_f_betas.append(f_beta)
            elif self.config["num_classes"] > 1:
                acc, recall, prec, f_beta = get_multi_metrics(
                    pred_y=eval_predictions, true_y=eval_batch["y"],
                    labels=self.label_list)
                eval_accs.append(acc)
                eval_recalls.append(recall)
                eval_precs.append(prec)
                eval_f_betas.append(f_beta)

        mean_eval_loss = mean(eval_losses)
        print("\n")
        print(
            "eval: loss: {}, acc: {}, auc: {}, recall: {}, precision: {}, f_beta: {}"
            .format(mean_eval_loss, mean(eval_accs), mean(eval_aucs),
                    mean(eval_recalls), mean(eval_precs), mean(eval_f_betas)))
        print("\n")
        return mean_eval_loss

    def train(self):
        """Train the model, evaluating and checkpointing after every epoch.

        After each epoch the evaluation set is scored.  When checkpointing is
        enabled (``config["ckpt_model_path"]`` is truthy):

        * if the mean evaluation loss is the lowest seen so far, a checkpoint
          is saved under ``self.save_path``;
        * otherwise the batch size is doubled (while it is <= 256) and the
          learning rate is multiplied by 0.95; once the learning rate has
          reached ``_MIN_LEARNING_RATE`` training stops.
        """
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9,
                                    allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            # Initialise variable values.
            sess.run(tf.global_variables_initializer())
            current_step = 0
            # Lowest mean evaluation loss seen so far; the previous code
            # compared against the running maximum and therefore only saved
            # checkpoints when the loss got worse.
            best_eval_loss = float("inf")

            # Summary directories and writers for train and eval.
            train_summary_writer = self._make_summary_writer("train", sess.graph)
            eval_summary_writer = self._make_summary_writer("eval", sess.graph)

            try:
                for epoch in range(self.config["epochs"]):
                    print("----- Epoch {}/{} -----".format(
                        epoch + 1, self.config["epochs"]))

                    for batch in self.train_data_obj.next_batch(
                            self.train_inputs, self.train_labels,
                            self.config["batch_size"]):
                        summary, loss, predictions = self.model.train(
                            sess, batch, self.config["keep_prob"],
                            self.config['learning_rate'])
                        current_step += 1
                        train_summary_writer.add_summary(summary, current_step)

                        if current_step % self.config["print_every"] == 0:
                            self._print_train_metrics(
                                current_step, loss, predictions, batch)

                    # Report evaluation-set results once per epoch.
                    if not self.eval_data_obj:
                        continue
                    eval_loss = self._evaluate(
                        sess, eval_summary_writer, current_step)

                    if not self.config["ckpt_model_path"]:
                        continue

                    if eval_loss <= best_eval_loss:
                        best_eval_loss = eval_loss
                        # self.model_save_path is the checkpoint file prefix.
                        self.model_save_path = os.path.join(
                            self.save_path, self.config["model_name"])
                        self.model.saver.save(sess, self.model_save_path,
                                              global_step=epoch + 1)
                        continue

                    # No improvement: enlarge the batch and decay the
                    # learning rate until it reaches its floor.
                    if self.config['batch_size'] <= 256:
                        self.config['batch_size'] *= 2
                    if self.config['learning_rate'] <= self._MIN_LEARNING_RATE:
                        print('learn_rate 小于0.00001,训练结束')
                        break
                    self.config['learning_rate'] *= 0.95
                    print("epoch: {} lr: {} self.batch_size: {}".format(
                        epoch, self.config['learning_rate'],
                        self.config['batch_size']))
                    # Kept in a local: overwriting self.save_path with the
                    # checkpoint prefix would corrupt the directory used by
                    # later saves.
                    latest_checkpoint = tf.train.latest_checkpoint(
                        self.save_path)
                    print('最新加载的模型路径{}'.format(latest_checkpoint))
            finally:
                # Flush and release the event files even if training fails.
                train_summary_writer.close()
                eval_summary_writer.close()