def __init__(self, data, model_config, learning_config, pretrained_weight,
             early_stopping=True, patience=100, json_path=None, vocab_path=None,
             mapping_path=None, odir=None):
    self.data = data
    self.model_config = model_config
    # max length of a sequence (max nodes among graphs)
    self.seq_max_length = data[MAX_N_NODES]
    self.learning_config = learning_config
    self.pretrained_weight = pretrained_weight
    self.is_cuda = learning_config['cuda']

    # with open(vocab_path+'/../mapping.json', 'r') as f:
    with open(mapping_path, 'r') as f:
        self.mapping = json.load(f)

    self.labels = self.data[LABELS]
    self.graphs_names = self.data[GNAMES]

    data_graph = self.data[GRAPH]
    data_nclasses = self.data[N_CLASSES]
    if N_RELS in self.data:
        data_nrels = self.data[N_RELS]
    else:
        data_nrels = None
    if N_ENTITIES in self.data:
        data_nentities = self.data[N_ENTITIES]
    else:
        data_nentities = None

    self.model = Model(g=data_graph,
                       config_params=model_config,
                       n_classes=data_nclasses,
                       n_rels=data_nrels,
                       n_entities=data_nentities,
                       is_cuda=self.is_cuda,
                       seq_dim=self.seq_max_length,
                       batch_size=1,
                       json_path=json_path,
                       vocab_path=vocab_path)

    if early_stopping:
        self.early_stopping = EarlyStopping(patience=patience, verbose=True)

    # Output folder to save train / test data
    if odir is None:
        odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
    self.odir = odir
def fit(self, trajectories, iteration): self._set_logging(iteration) # self.optimizer = torch.optim.Adam( # filter(lambda x: x.requires_grad, self.model.parameters()), # lr=self.args.vae_lr, # ) dataset_train, dataset_test = self.preprocess_trajectories( trajectories) num_trajectories = len(dataset_train) batch_size = num_trajectories // self.args.vae_batches loader_train = torch.utils.data.DataLoader(dataset_train, shuffle=True, batch_size=batch_size, num_workers=2) loader_test = torch.utils.data.DataLoader(dataset_test, shuffle=False, batch_size=batch_size, num_workers=2) num_max_epoch = self.args.vae_max_fit_epoch if 'Point2D' in self.args.env_name: min_delta = 0.05 else: min_delta = 0.005 # TODO: tune early_stopping = EarlyStopping(mode='min', min_delta=min_delta, patience=num_max_epoch // 10) t = tqdm(range(num_max_epoch)) for i_epoch in t: loss_train = self._train(loader_train, i_epoch) t.set_description('train loss: {}'.format(loss_train)) if i_epoch == 0 or (i_epoch + 1) % (num_max_epoch // 5) == 0: loss_test = self._eval(loader_test, i_epoch) # print('epoch: {}\tloss: {}'.format(i_epoch, losses.avg)) t.write('epoch: {}\ttrain loss: {}\ttest_loss: {}'.format( i_epoch, loss_train, loss_test)) if i_epoch > num_max_epoch // 5: if early_stopping.step( loss_train): # doesn't start tracking until epoch 300 t.close() break model = copy.deepcopy(self.model).cpu() mean = self.mean.clone() std = self.std.clone() torch.save(dict(model=model, mean=mean, std=std), self.filename) print('wrote vae model to {}'.format(self.filename))
def train(params, m, data_x, data_y):
    es = EarlyStopping(min_delta=params.min_delta, patience=params.patience)
    # optimizer
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()),
                           lr=params.init_learning_rate)
    n_batch = data_x.train_size // params.bs if data_x.train_size % params.bs == 0 else data_x.train_size // params.bs + 1
    data_idxs = list(range(data_x.train_size))
    # number of iterations
    cur_it = 0
    # write to tensorboard
    writer = SummaryWriter('./history/{}'.format(params.emb_out_path)) if params.write_tfboard else None
    nll_dev = math.inf
    best_nll_dev = math.inf
    kld_dev = math.inf
    for i in range(params.ep):
        shuffle(data_idxs)
        for j in range(n_batch):
            train_idxs = data_idxs[j * params.bs: (j + 1) * params.bs]
            # get padded & sorted batch idxs and lens
            padded_batch_x, batch_x_lens = get_batch(train_idxs, data_x, data_x.train_idxs, data_x.train_lens, params.cuda)
            padded_batch_y, batch_y_lens = get_batch(train_idxs, data_y, data_y.train_idxs, data_y.train_lens, params.cuda)
            optimizer.zero_grad()
            m.train()
            nll_batch, kld_batch = m(padded_batch_x, batch_x_lens, padded_batch_y, batch_y_lens)
            cur_it += 1
            loss_batch, alpha = calc_loss_batch(params, nll_batch, kld_batch, cur_it, n_batch)
            loss_batch.backward()
            optimizer.step()
            out_parallel(i, j, n_batch, loss_batch, nll_batch, kld_batch, best_nll_dev, nll_dev, kld_dev, es.num_bad_epochs)
            update_tensorboard(writer, loss_batch, nll_batch, kld_batch, alpha, nll_dev, kld_dev, cur_it)
            if cur_it % params.VAL_EVERY == 0:
                sys.stdout.write('\n')
                sys.stdout.flush()
                # validation
                nll_dev, kld_dev = test(params, m, data_x, data_y)
                if es.step(nll_dev):
                    print('\nEarly Stopped.')
                    return
                elif es.is_better(nll_dev, best_nll_dev):
                    best_nll_dev = nll_dev
                    # save model
                    m.save_embedding(params, data_x, 'x')
                    m.save_embedding(params, data_y, 'y')
                    m.save_model(params, data_x, data_y, optimizer)
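# The fit() and train() loops above, and the CLDC trainer further down, drive early
# stopping through an object exposing step(metric), is_better(metric, best) and a
# num_bad_epochs counter. A minimal sketch of such a helper follows; it assumes a
# plain patience/min_delta rule and is illustrative only, not the original
# implementation used by these snippets.
class EarlyStopping:
    """Patience-based early stopping (illustrative sketch)."""

    def __init__(self, mode='min', min_delta=0.0, patience=10):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience
        self.best = None
        self.num_bad_epochs = 0

    def is_better(self, metric, best):
        # 'min' mode: improvement means the metric dropped by more than min_delta
        if self.mode == 'min':
            return metric < best - self.min_delta
        # 'max' mode: improvement means the metric rose by more than min_delta
        return metric > best + self.min_delta

    def step(self, metric):
        """Record one validation metric; return True when training should stop."""
        if self.best is None:
            self.best = metric
            return False
        if self.is_better(metric, self.best):
            self.best = metric
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1
        return self.num_bad_epochs >= self.patience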
def __init__(self, teacher_model, student_model, device, config, fold_num):
    self.config = config
    self.epoch = 0
    self.start_epoch = 0
    self.fold_num = fold_num
    if self.config.stage2:
        self.base_dir = f'./result/stage2/{config.dir}/{config.dir}_fold_{config.fold_num}'
    else:
        self.base_dir = f'./result/{config.dir}/{config.dir}_fold_{config.fold_num}'
    os.makedirs(self.base_dir, exist_ok=True)
    self.log_path = f'{self.base_dir}/log.txt'
    self.best_summary_loss = 10**5

    self.teacher_model = teacher_model
    self.teacher_model.eval()
    self.student_model = student_model
    self.device = device
    self.wandb = True
    self.cutmix = self.config.cutmix_ratio
    self.fmix = self.config.fmix_ratio
    self.smix = self.config.smix_ratio
    self.es = EarlyStopping(patience=5)
    self.scaler = GradScaler()
    self.amp = self.config.amp

    param_optimizer = list(self.student_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    self.optimizer, self.scheduler = get_optimizer(
        self.student_model, self.config.optimizer_name,
        self.config.optimizer_params, self.config.scheduler_name,
        self.config.scheduler_params, self.config.n_epochs)
    self.criterion = get_criterion(self.config.criterion_name,
                                   self.config.criterion_params)
    self.log(f'Fitter prepared. Device is {self.device}')
    set_wandb(self.config, fold_num)
def train_model(args: Namespace): random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) for fold in args.folds: es = EarlyStopping(patience=args.es_patience) print(f'STARTING FOLD {fold}') torch.cuda.empty_cache() run_name = args.run_name + '_' + str(fold) device = torch.device('cuda:0') model, optimizer = load_model(args, fold) train_loader, val_loader = get_dataloaders(args, fold) loss_function = cross_entropy scheduler = define_lr_scheduler(args, optimizer, train_loader) train_loss = AverageMeter() dev_loss = AverageMeter() log = get_logger('zindi' + ":" + run_name) checkpoint_path = 'checkpoints/' + run_name checkpointer = ModelCheckpoint(checkpoint_path, 'checkpoint', n_saved=5, score_name='NLL_loss', save_as_state_dict=False, require_empty=False) for epoch in range(args.max_epochs): train_loss.reset() dev_loss.reset() train(args, model, device, train_loader, optimizer, scheduler, loss_function, epoch, train_loss, log) dev_log_loss = validate(model, device, val_loader, loss_function, epoch, dev_loss, log) checkpointer( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'model_class': args.model_class, 'model_hyperparams': args.model_hyperparams, 'optimizer_state_dict': optimizer.state_dict(), 'loss': dev_loss.value }, score=-dev_log_loss) if es.step(dev_log_loss): print('EARLY STOPPING ON EPOCH ' + str(epoch)) break
def runEpoch(loader, model, loss_fn, optimizer, scheduler, device, vis, epoch, iFold, folds_pbar, avg_training_loss, avg_training_score, logger_options, optimizer_options, msg_dict): ## ======================================= Early Stop ======================================= ## early_stop = False if not (optimizer_options['early_stopping'] == ""): #['min', '0.01', '21'] mode = optimizer_options['early_stopping'][0] min_delta = float(optimizer_options['early_stopping'][1]) patience = int(optimizer_options['early_stopping'][2]) early_stopping = EarlyStopping(mode=mode, min_delta=min_delta, patience=patience) ## ======================================= Early Stop ======================================= ## trainer = Engine(model, optimizer, loss_fn, scheduler, loader, optimizer_options["accumulate_count"], device, use_half_precision=optimizer_options["use_half_precision"], score_type="f1") iteration_pbar = ProgressBar(loader, desc="Iteration", pb_len=optimizer_options['max_iterations']) max_iterations = iteration_pbar.total for iteration, data_dict in enumerate(iteration_pbar): images = data_dict['data'] phase_annotations = data_dict['target'] ### ============================== Training ============================== ### train_loss, train_score = trainer(images.to(device=device), phase_annotations.to(device=device)) avg_training_loss.update(train_loss) avg_training_score.update(train_score) msg_dict['ATL'] = avg_training_loss.get_value()[0] msg_dict['ATS'] = avg_training_score.get_value()[0] ### ============================== Training ============================== ### ### ============================== Plot ============================== ### if ((iteration) % logger_options["vislogger_interval"] == 0): # print(avg_training_loss.get_value()[0]) vis.line(X=np.array([epoch + (iteration/iteration_pbar.total)]), Y=np.array([avg_training_loss.get_value()[0]]), update='append', win='Training_Loss_Fold_'+str(iFold+1), name='Training Loss Fold '+str(iFold+1)) ### ============================== Plot ============================== ### if early_stop: iteration_pbar.close() print("\n==========================\nEarly stop\n==========================\n") break folds_pbar.update_message(msg_dict=msg_dict) if iteration == max_iterations: iteration_pbar.refresh() iteration_pbar.close() break
def train(self, image, epochs, enable_es=1):
    graph = tf.Graph()
    with tf.Session(graph=graph) as session:
        tf.set_random_seed(1234)
        self.__create_inputs()
        new_saver = self.__create_graph(self.meta_file)
        self.__create_loss_optimizer()
        # slim.model_analyzer.analyze_vars(tf.trainable_variables(), print_info=True)
        early_stopping = EarlyStopping(patience=30, min_delta=1e-1)
        tf.global_variables_initializer().run()
        new_saver.restore(session, self.latest_checkpoint)
        recons_loss = list()
        print('Starting optimization...')

        for cur_epoch in range(epochs + 1):
            dict_loss = self.__train_epoch(session, image)
            list_loss = list(dict_loss.values())

            if np.isnan(list_loss[0]):
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                sys.exit()

            if (cur_epoch % 20 == 0 or cur_epoch == 0):
                print('EPOCH: {} | dist: {} '.format(cur_epoch, list_loss[0]))

            recons_loss.append(list_loss[0])

            # Early stopping
            if (cur_epoch > 50 and enable_es == 1 and early_stopping.stop(list_loss[0])):
                print('Early Stopping!')
                print('EPOCH: {} | dist: {} '.format(cur_epoch, list_loss[0]))
                break

        z_infer = session.run(self.z)
        x_recons = session.run(self.x_recons)
        return z_infer, x_recons, recons_loss
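# The loop above (and the autoencoder trainer later in this section) consults a
# stop(loss)-style stopper rather than step(). A rough sketch under the assumption
# that stop() counts successive calls without sufficient improvement; the `name`
# and `decay_fn` arguments accepted by the later variant are taken but left unused,
# so this is not the original implementation.
class EarlyStopping:
    """stop()-style early stopping (illustrative sketch)."""

    def __init__(self, patience=30, min_delta=0.0, name='loss', decay_fn=None):
        self.patience = patience
        self.min_delta = min_delta
        self.name = name          # accepted for interface compatibility; unused here
        self.decay_fn = decay_fn  # accepted for interface compatibility; unused here
        self.best = float('inf')
        self.wait = 0

    def stop(self, loss):
        """Return True once `loss` has failed to improve for `patience` calls."""
        if loss < self.best - self.min_delta:
            self.best = loss
            self.wait = 0
            return False
        self.wait += 1
        return self.wait >= self.patience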
def training(model, epoches, lr, wd):
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.CrossEntropyLoss()
    early_stopper = EarlyStopping(model_dir, patience=PATIENCE)
    for ep in range(epoches):
        model = train_epoch(ep, model, optimizer, criterion, early_stopper)
        optimizer = learning_rate_decay(optimizer)
        if early_stopper.early_stop:
            return model
    return model
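# training() above builds the stopper with a checkpoint directory and only reads its
# `early_stop` flag; the flag is presumably updated inside train_epoch(), which
# receives the stopper. The sketch below assumes a __call__(val_loss, model) update
# and a hypothetical 'best.pt' filename; it is not the original implementation.
import os

import torch


class EarlyStopping:
    """Checkpointing early stopping with an `early_stop` flag (illustrative sketch)."""

    def __init__(self, model_dir, patience=7):
        self.model_dir = model_dir
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            # improvement: reset the counter and save the best weights so far
            self.best_loss = val_loss
            self.counter = 0
            os.makedirs(self.model_dir, exist_ok=True)
            torch.save(model.state_dict(), os.path.join(self.model_dir, 'best.pt'))
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True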
def run_ppi(train_loader, val_loader, test_loader, model, epochs, lr, weight_decay, patience, device, logger=True): model.to(device) optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, # mode='min', # factor=0.5, # patience=20) early_stopping = EarlyStopping(patience=patience) # path1 = osp.join(osp.dirname(osp.realpath(__file__)), 'runs', 'train') # path2 = osp.join(osp.dirname(osp.realpath(__file__)), 'runs', 'val') # writer_train = SummaryWriter(path1) # writer_val = SummaryWriter(path2) for epoch in range(1, epochs + 1): train_loss = train_ppi(model, optimizer, train_loader, device) val_loss = val_ppi(model, val_loader, device) test_f1 = test_ppi(model, test_loader, device) # scheduler.step(val_loss) # writer_train.add_scalar('training loss', train_loss, epoch) # writer_val.add_scalar('val loss', val_loss, epoch) if logger: print( '{:03d}: Train Loss: {:.4f}, Val Loss: {:.4f}, Test F1: {:.4f}' .format(epoch, train_loss, val_loss, test_f1)) early_stopping(val_loss, test_f1) if early_stopping.early_stop: best_val_loss = early_stopping.best_score best_test_f1 = early_stopping.best_score_acc print('Val Loss: {:.3f}, Test F1 Score: {:.4f}'.format( best_val_loss, best_test_f1)) break
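# run_ppi() calls the stopper as early_stopping(val_loss, test_f1) and later reads
# best_score and best_score_acc, so it apparently remembers the companion metric
# observed at the best validation loss. A possible sketch (assumed behaviour, not
# the original implementation):
class EarlyStopping:
    """Tracks the best (lowest) score and the metric seen alongside it (sketch)."""

    def __init__(self, patience=10):
        self.patience = patience
        self.best_score = None
        self.best_score_acc = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, acc):
        if self.best_score is None or score < self.best_score:
            self.best_score = score
            self.best_score_acc = acc
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True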
def run_std(runs, file_name, **kwargs):
    train_accs, val_accs, test_accs = [], [], []
    for i in range(runs):
        kwargs["model"].reset_parameters()
        es = EarlyStopping(patience=20)
        train_node_acc, val_node_acc, test_node_acc = trainer(early_stopping=es, **kwargs)
        train_accs.append(train_node_acc)
        val_accs.append(val_node_acc)
        test_accs.append(test_node_acc)
    with open(file_name, "w") as std_file:
        std_file.write(f"{np.mean(train_accs)}, {np.std(train_accs)}\n")
        std_file.write(f"{np.mean(val_accs)}, {np.std(val_accs)}\n")
        std_file.write(f"{np.mean(test_accs)}, {np.std(test_accs)}\n")
def _get_early_stopper(self):
    return EarlyStopping(self.config['stage%d' % self.stage]['stopper']['patience'])
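# A short usage example of the config-driven accessor above. The nested dict layout
# mirrors the lookup path in the method; the surrounding Trainer class and the
# patience values are hypothetical.
class Trainer:
    def __init__(self, config, stage):
        self.config = config
        self.stage = stage

    def _get_early_stopper(self):
        return EarlyStopping(self.config['stage%d' % self.stage]['stopper']['patience'])


config = {
    'stage1': {'stopper': {'patience': 15}},
    'stage2': {'stopper': {'patience': 30}},
}
stopper = Trainer(config, stage=2)._get_early_stopper()  # built with patience=30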
def one_fold(num_fold, train_index, dev_index): print("Training on fold:", num_fold) X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index] y_train, y_dev = y[train_index], y[dev_index] # construct data loader train_data_set = TrainDataSet(X_train, y_train, CONV_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_set = TrainDataSet(X_dev, y_dev, CONV_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False) # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") pred_list_test_best = None final_pred_best = None # This is to prevent model diverge, once happen, retrain while True: is_diverged = False # Model is defined in HierarchicalPredictor model = HierarchicalPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, num_of_vocab, USE_ELMO=True, ADD_LINEAR=False) model.load_embedding(emb) model.deepmoji_model.load_specific_weights( PRETRAINED_PATH, exclude_names=['output_layer']) model.cuda() # model = nn.DataParallel(model) # model.to(device) optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True) # # optimizer = optim.SGD(model.parameters(), lr=learning_rate) scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=opt.gamma) if opt.w == 1: weight_list = [0.3, 0.3, 0.3, 1.7] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] elif opt.w == 2: weight_list = [ 0.3198680179, 0.246494733, 0.2484349259, 1.74527696 ] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] else: raise ValueError weight_list = [x**FLAT for x in weight_list] weight_label = torch.Tensor(weight_list).cuda() weight_list_binary = [x**FLAT for x in weight_list_binary] weight_binary = torch.Tensor(weight_list_binary).cuda() print('classification reweight: ', weight_list) print('binary loss reweight = weight_list_binary', weight_list_binary) # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary) # if opt.loss == 'focal': loss_criterion = FocalLoss(gamma=opt.focal, reduce=False) loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False) # elif opt.loss == 'ce': loss_criterion = nn.CrossEntropyLoss(reduce=False) loss_criterion_binary = nn.CrossEntropyLoss(reduce=False) # loss_criterion_emo_only = nn.MSELoss() es = EarlyStopping(patience=EARLY_STOP_PATIENCE) # best_model = None final_pred_list_test = None pred_list_test = None for num_epoch in range(MAX_EPOCH): # to ensure shuffle at ever epoch train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True) print('Begin training epoch:', num_epoch, end='...\t') sys.stdout.flush() # stepping scheduler scheduler.step(num_epoch) print('Current learning rate', scheduler.get_lr()) train_loss = 0 model.train() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo) \ in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE): optimizer.zero_grad() elmo_a = elmo_encode(a) elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \ e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = 
loss_criterion_emo_only(pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) # loss = torch.matmul(torch.gather(weight, 0, trg.view(-1).cuda()), loss) / trg.view(-1).shape[0] # training trilogy loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP) optimizer.step() train_loss += loss.data.cpu().numpy() * a.shape[0] del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo # Evaluate model.eval() dev_loss = 0 # pred_list = [] # gold_list = [] for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo)\ in enumerate(dev_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) loss_label = loss_criterion( pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul( torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = loss_criterion_emo_only( pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) dev_loss += loss.data.cpu().numpy() * a.shape[0] # pred_list.append(pred.data.cpu().numpy()) # gold_list.append(e_c.numpy()) del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo print('Training loss:', train_loss / len(train_data_set), end='\t') print('Dev loss:', dev_loss / len(dev_data_set)) # print(classification_report(gold_list, pred_list, target_names=EMOS)) # get_metrics(pred_list, gold_list) if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4: print("Model diverged, retry") is_diverged = True break if es.step(dev_loss): # overfitting print('overfitting, loading best model ...') break else: if es.is_best(): print('saving best model ...') if final_pred_best is not None: del final_pred_best final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is not None: del pred_list_test_best pred_list_test_best = deepcopy(pred_list_test) else: print('not best model, ignoring ...') if final_pred_best is None: final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is None: pred_list_test_best = deepcopy(pred_list_test) # Gold Dev testing... 
print('Gold Dev testing....') pred_list_test = [] model.eval() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c) in enumerate(gold_dev_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) # , __id2word=ex_id2word elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) pred_list_test.append(pred.data.cpu().numpy()) del elmo_a, elmo_b, elmo_c, a, b, c, pred pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test) # Testing print('Gold test testing...') final_pred_list_test = [] model.eval() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c) in enumerate(test_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) # , __id2word=ex_id2word elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) final_pred_list_test.append(pred.data.cpu().numpy()) del elmo_a, elmo_b, elmo_c, a, b, c, pred final_pred_list_test = np.argmax(np.concatenate( final_pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test) if is_diverged: print("Reinitialize model ...") del model continue all_fold_results.append(pred_list_test_best) real_test_results.append(final_pred_best) del model break
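# The fold loop above (and the LSTM/ELMo trainer at the end of this section) pairs
# es.step(dev_loss) with es.is_best(): step() decides whether to abort, and is_best()
# reports whether the most recent call improved on the best loss, which gates the
# snapshotting of predictions and model copies. A sketch of that pattern, assuming a
# simple patience rule rather than the original implementation:
class EarlyStopping:
    """step()/is_best() early stopping (illustrative sketch)."""

    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.num_bad_epochs = 0
        self._improved = False

    def step(self, loss):
        """Record one validation loss; return True when patience is exhausted."""
        if self.best is None or loss < self.best - self.min_delta:
            self.best = loss
            self.num_bad_epochs = 0
            self._improved = True
        else:
            self.num_bad_epochs += 1
            self._improved = False
        return self.num_bad_epochs >= self.patience

    def is_best(self):
        """True if the most recent step() improved on the best loss so far."""
        return self._improved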
def __init__(self, data, model_config, learning_config, pretrained_weight, early_stopping=True, patience=100, json_path=None, pickle_folder=None, vocab_path=None, mapping_path=None, odir=None, model_src_path=None, gdot_path=None): if model_src_path is not None: sys.path.insert(0, model_src_path) print('*** [app][__init__] model_src_path', model_src_path) from model_edgnn_o import Model else: from models.model_edgnn_o import Model print('*** [app][__init__] gdot_path', gdot_path) self.data = data self.model_config = model_config # max length of a sequence (max nodes among graphs) self.learning_config = learning_config self.pretrained_weight = pretrained_weight self.is_cuda = learning_config['cuda'] # with open(vocab_path+'/../mapping.json', 'r') as f: with open(mapping_path, 'r') as f: self.mapping = json.load(f) self.labels = self.data[LABELS] self.graphs_names = self.data[GNAMES] self.data_graph = self.data[GRAPH] # save nid and eid to nodes & edges # print('self.data_graph[0]', self.data_graph[0]) # if 'nid' not in self.data_graph[0].ndata: # # if True: # for k,g in enumerate(self.data_graph): # g = self.write_nid_eid(g) # self.data_graph[k] = g # # print('self.data_graph', self.data_graph) # save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH)) self.data_nclasses = self.data[N_CLASSES] if N_RELS in self.data: self.data_nrels = self.data[N_RELS] else: self.data_nrels = None if N_ENTITIES in self.data: self.data_nentities = self.data[N_ENTITIES] else: self.data_nentities = None self.ModelObj = Model self.model_src_path = model_src_path self.model = self.ModelObj( g=self.data_graph[0], config_params=self.model_config, n_classes=self.data_nclasses, n_rels=self.data_nrels, n_entities=self.data_nentities, is_cuda=self.is_cuda, batch_size=1, # json_path=json_path, # vocab_path=vocab_path, model_src_path=model_src_path) if self.is_cuda is True: print('[app][__init__] Convert model to use cuda') self.model = self.model.cuda() # self.model = self.model.to(torch.device('cuda:{}'.format(self.learning_config['gpu']))) print('>>> [app][__init__] self.model', self.model) print('>>> [app][__init__] Check if model use cuda', next(self.model.parameters()).is_cuda) # print('*** [app][__init__] Model parameters ***') # pp=0 # for p in list(self.model.parameters()): # nn=1 # for s in list(p.size()): # # print('p', p) # print('\t s, nn, nn*s', s, nn, nn*s) # nn = nn*s # pp += nn # print('[app][__init__] Total params', pp) if early_stopping: self.early_stopping = EarlyStopping(patience=patience, verbose=True) # Output folder to save train / test data if odir is None: odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S") self.odir = odir
class App: """ App inference """ TRAIN_SIZE = 0.7 def __init__(self, data, model_config, learning_config, pretrained_weight, early_stopping=True, patience=100, json_path=None, pickle_folder=None, vocab_path=None, mapping_path=None, odir=None, model_src_path=None, gdot_path=None): if model_src_path is not None: sys.path.insert(0, model_src_path) print('*** [app][__init__] model_src_path', model_src_path) from model_edgnn_o import Model else: from models.model_edgnn_o import Model print('*** [app][__init__] gdot_path', gdot_path) self.data = data self.model_config = model_config # max length of a sequence (max nodes among graphs) self.learning_config = learning_config self.pretrained_weight = pretrained_weight self.is_cuda = learning_config['cuda'] # with open(vocab_path+'/../mapping.json', 'r') as f: with open(mapping_path, 'r') as f: self.mapping = json.load(f) self.labels = self.data[LABELS] self.graphs_names = self.data[GNAMES] self.data_graph = self.data[GRAPH] # save nid and eid to nodes & edges # print('self.data_graph[0]', self.data_graph[0]) # if 'nid' not in self.data_graph[0].ndata: # # if True: # for k,g in enumerate(self.data_graph): # g = self.write_nid_eid(g) # self.data_graph[k] = g # # print('self.data_graph', self.data_graph) # save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH)) self.data_nclasses = self.data[N_CLASSES] if N_RELS in self.data: self.data_nrels = self.data[N_RELS] else: self.data_nrels = None if N_ENTITIES in self.data: self.data_nentities = self.data[N_ENTITIES] else: self.data_nentities = None self.ModelObj = Model self.model_src_path = model_src_path self.model = self.ModelObj( g=self.data_graph[0], config_params=self.model_config, n_classes=self.data_nclasses, n_rels=self.data_nrels, n_entities=self.data_nentities, is_cuda=self.is_cuda, batch_size=1, # json_path=json_path, # vocab_path=vocab_path, model_src_path=model_src_path) if self.is_cuda is True: print('[app][__init__] Convert model to use cuda') self.model = self.model.cuda() # self.model = self.model.to(torch.device('cuda:{}'.format(self.learning_config['gpu']))) print('>>> [app][__init__] self.model', self.model) print('>>> [app][__init__] Check if model use cuda', next(self.model.parameters()).is_cuda) # print('*** [app][__init__] Model parameters ***') # pp=0 # for p in list(self.model.parameters()): # nn=1 # for s in list(p.size()): # # print('p', p) # print('\t s, nn, nn*s', s, nn, nn*s) # nn = nn*s # pp += nn # print('[app][__init__] Total params', pp) if early_stopping: self.early_stopping = EarlyStopping(patience=patience, verbose=True) # Output folder to save train / test data if odir is None: odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S") self.odir = odir def write_nid_eid(self, g): num_nodes = g.number_of_nodes() num_edges = g.number_of_edges() g.ndata['nid'] = torch.tensor([-1] * num_nodes) g.edata['eid'] = torch.tensor([-1] * num_edges) # print("self.g.ndata['nid']", g.ndata['nid']) # save nodeid and edgeid to each node and edge for nid in range(num_nodes): g.ndata['nid'][nid] = torch.tensor([nid]).type(torch.LongTensor) for eid in range(g.number_of_edges()): g.edata['eid'][eid] = torch.tensor([eid]).type(torch.LongTensor) return g def train(self, save_path='', k_fold=10, train_list_file=None, test_list_file=None): if self.pretrained_weight is not None: self.model = load_checkpoint(self.model, self.pretrained_weight, self.is_cuda) save_dir = save_path.split('/checkpoint')[0] loss_fcn = torch.nn.CrossEntropyLoss() # initialize graphs self.accuracies = 
np.zeros(k_fold) graphs = self.data[GRAPH] # load all the graphs # debug purposes: reshuffle all the data before the splitting random_indices = list(range(len(graphs))) random.shuffle(random_indices) graphs = [graphs[i] for i in random_indices] labels = self.labels[random_indices] graphs_names = [self.graphs_names[i] for i in random_indices] split_train_test = True if train_list_file is None and test_list_file is None else False print('[app][train] split_train_test', split_train_test) ''' if split_train_test is True: print('[app][train] train_list_file', train_list_file) print('[app][train] test_list_file', test_list_file) ############################# # Create new train/test set # Split train and test ############################# train_size = int(self.TRAIN_SIZE * len(graphs)) g_train = graphs[:train_size] l_train = labels[:train_size] n_train = graphs_names[:train_size] g_test = graphs[train_size:] l_test = labels[train_size:] n_test = graphs_names[train_size:] else: ############################# # Load train and test graphs from list ############################# train_files = [] test_files = [] g_train = [] l_train = [] n_train = [] g_test = [] l_test = [] n_test = [] with open(train_list_file, 'r') as f: train_files = [l.strip() for l in f.readlines()] with open(test_list_file, 'r') as f: test_files = [l.strip() for l in f.readlines()] for i in range(len(labels)): graph_jsonpath = graphs_names[i] # print(graph_jsonpath) if graph_jsonpath in train_files: g_train.append(graphs[i]) l_train.append(labels[i]) n_train.append(graphs_names[i]) if graph_jsonpath in test_files: g_test.append(graphs[i]) l_test.append(labels[i]) n_test.append(graphs_names[i]) l_train = torch.Tensor(l_train).type(torch.LongTensor) l_test = torch.Tensor(l_test).type(torch.LongTensor) if self.is_cuda is True: l_train = l_train.cuda() l_test = l_test.cuda() ''' print('[app][train] len labels', len(labels)) print('[app][train] len g_train', len(g_train)) # print('[app][train] g_train', g_train) if not os.path.isdir(self.odir): os.makedirs(self.odir) save_pickle(g_train, os.path.join(self.odir, 'train')) save_pickle(l_train, os.path.join(self.odir, 'train_labels')) save_pickle(g_test, os.path.join(self.odir, 'test')) save_pickle(l_test, os.path.join(self.odir, 'test_labels')) # save graph name list to txt file save_txt(n_train, os.path.join(self.odir, 'train_list.txt')) save_txt(n_test, os.path.join(self.odir, 'test_list.txt')) K = k_fold for k in range(K): self.model = self.ModelObj(g=self.data_graph[0], config_params=self.model_config, n_classes=self.data_nclasses, n_rels=self.data_nrels, n_entities=self.data_nentities, is_cuda=self.is_cuda, batch_size=1, model_src_path=self.model_src_path) print('*** [app][__init__] Model layers ***') for name, param in self.model.named_parameters(): if param.requires_grad: print('\t', name, param.data.type()) print('>>> [app][__init__] self.model.fc.weight.type', self.model.fc.weight.type()) optimizer = torch.optim.Adam( self.model.parameters(), lr=self.learning_config['lr'], weight_decay=self.learning_config['weight_decay']) start = int(len(g_train) / K) * k end = int(len(g_train) / K) * (k + 1) print('\n\n\n[app][train] Process new k=' + str(k) + ' | ' + str(start) + '-' + str(end)) # training batch train_batch_graphs = g_train[:start] + g_train[end:] train_batch_labels = l_train[list(range(0, start)) + list(range(end + 1, len(g_train)))] train_batch_samples = list( map(list, zip(train_batch_graphs, train_batch_labels))) train_batches = DataLoader( train_batch_samples, 
batch_size=self.learning_config['batch_size'], shuffle=True, collate_fn=collate) # testing batch val_batch_graphs = g_train[start:end] val_batch_labels = l_train[start:end] # print('[app][train] val_batch_graphs', val_batch_graphs) print('[app][train] len val_batch_graphs', len(val_batch_graphs)) print('[app][train] val_batch_graphs[0].number_of_nodes()', val_batch_graphs[0].number_of_nodes()) print('[app][train] val_batch_graphs[-1].number_of_nodes()', val_batch_graphs[-1].number_of_nodes()) val_batch = dgl.batch(val_batch_graphs) print('[app][train] train_batches size: ', len(train_batches)) print('[app][train] train_batch_graphs size: ', len(train_batch_graphs)) print('[app][train] val_batch_graphs size: ', len(val_batch_graphs)) print('[app][train] train_batches', train_batches) print('[app][train] val_batch_labels', val_batch_labels) dur = [] for epoch in range(self.learning_config['epochs']): self.model.train() if epoch >= 3: t0 = time.time() losses = [] training_accuracies = [] for iter_idx, (bg, label) in enumerate(train_batches): # print('~~~ [app][train] bg', bg) logits = self.model(bg) if self.learning_config['cuda']: label = label.cuda() loss = loss_fcn(logits, label) losses.append(loss.item()) _, indices = torch.max(logits, dim=1) # print('~~~~ logits', logits) # print('------------------') print('\t [app][train] indices', indices) # print('\t label', label) correct = torch.sum(indices == label) training_accuracies.append(correct.item() * 1.0 / len(label)) optimizer.zero_grad() loss.backward(retain_graph=True) # loss.backward() optimizer.step() if epoch >= 3: dur.append(time.time() - t0) val_acc, val_loss, _ = self.model.eval_graph_classification( val_batch_labels, val_batch) print( "[app][train] Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}" .format(epoch, np.mean(dur) if dur else 0, np.mean(training_accuracies), np.mean(losses), val_acc, val_loss)) is_better = self.early_stopping(val_loss, self.model, save_path) if is_better: self.accuracies[k] = val_acc if self.early_stopping.early_stop: # Print model's state_dict # print("*** Model's state_dict:") # for param_tensor in self.model.state_dict(): # print(param_tensor, "\t", self.model.state_dict()[param_tensor].size()) # # Print optimizer's state_dict # print("*** Optimizer's state_dict:") # for var_name in optimizer.state_dict(): # print(var_name, "\t", optimizer.state_dict()[var_name]) # Save state dict # torch.save(self.model.state_dict(), save_dir+'/model_state.pt') # Save model # torch.save({ # 'epoch': epoch, # 'model_state_dict': self.model.state_dict(), # 'optimizer_state_dict': optimizer.state_dict(), # 'val_loss': val_loss, # }, save_dir+'/saved') print("[app][train] Early stopping") break self.early_stopping.reset() def test(self, model_path=''): print('[app][test] Test model') try: print('*** [app][test] Load pre-trained model ' + model_path + ' ***') self.model = load_checkpoint(self.model, model_path, self.is_cuda) except ValueError as e: print('[app][test] Error while loading the model.', e) self.save_traintest() # print('\n[app][test] Test all') # # acc = np.mean(self.accuracies) # # acc = self.accuracies # graphs = self.data[GRAPH] # labels = self.labels # self.run_test(graphs, labels) graphs = load_pickle(os.path.join(self.odir, 'train')) labels = load_pickle(os.path.join(self.odir, 'train_labels')) print('\n[app][test] Test on train graphs ({})'.format(len(labels)), os.path.join(self.odir, 'train')) self.run_test_fold(graphs, labels, fold=300) graphs 
= load_pickle(os.path.join(self.odir, 'test')) labels = load_pickle(os.path.join(self.odir, 'test_labels')) print('\n[app][test] Test on test graphs ({})'.format(len(labels)), os.path.join(self.odir, 'test')) self.run_test_fold(graphs, labels, fold=150) def test_on_data(self, model_path=''): print('[app][test_on_data] Test model') try: print('*** [app][test_on_data] Load pre-trained model ' + model_path + ' ***') self.model = load_checkpoint(self.model, model_path, self.is_cuda) except ValueError as e: print('Error while loading the model.', e) print('\n[app][test_on_data] Test on data') # acc = np.mean(self.accuracies) # acc = self.accuracies graphs = self.data[GRAPH] labels = self.labels self.run_test(graphs, labels) # batch_size = 1024 # batch_num = len(graphs) // batch_size # print('batch_num', batch_num) # for batch in range(batch_num): # start = (batch)*batch_size # end = (batch+1)*batch_size # graphs = graphs[start:end] # print(batch, len(graphs)) # self.run_test(graphs, labels) def save_traintest(self): graphs = self.data[GRAPH] # load all the graphs # labels = self.labels # graphs_names = self.graphs_names # debug purposes: reshuffle all the data before the splitting random_indices = list(range(len(graphs))) random.shuffle(random_indices) graphs = [graphs[i] for i in random_indices] labels = self.labels[random_indices] graphs_names = [self.graphs_names[i] for i in random_indices] if True: train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/train_list.txt' test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/test_list.txt' train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_train_list.txt' test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_test_list.txt' train_files = [] test_files = [] g_train = [] l_train = [] n_train = [] g_test = [] l_test = [] n_test = [] with open(train_list_file, 'r') as f: train_files = [l.strip() for l in f.readlines()] with open(test_list_file, 'r') as f: test_files = [l.strip() for l in f.readlines()] for i in range(len(labels)): graph_jsonpath = graphs_names[i] # print(graph_jsonpath) if graph_jsonpath in train_files: g_train.append(graphs[i]) l_train.append(labels[i]) n_train.append(graphs_names[i]) if graph_jsonpath in test_files: g_test.append(graphs[i]) l_test.append(labels[i]) n_test.append(graphs_names[i]) l_train = torch.Tensor(l_train).type(torch.LongTensor) l_test = torch.Tensor(l_test).type(torch.LongTensor) if self.is_cuda is True: l_train = l_train.cuda() l_test = l_test.cuda() print('[app][save_traintest] len labels', len(labels)) print('[app][save_traintest] len l_test', len(l_test)) print('[app][save_traintest] len l_train', len(l_train)) tot_bgn = (labels == self.mapping['benign']).sum().item() tot_mal = (labels == self.mapping['malware']).sum().item() print('[app][save_traintest] tot_bgn', tot_bgn, 'tot_mal', tot_mal) if not os.path.isdir(self.odir): os.makedirs(self.odir) save_pickle(g_train, os.path.join(self.odir, 'train')) save_pickle(l_train, os.path.join(self.odir, 'train_labels')) save_pickle(g_test, os.path.join(self.odir, 'test')) save_pickle(l_test, os.path.join(self.odir, 'test_labels')) def run_test_fold(self, graphs, labels, fold=5): num_g = len(labels) num_g_per_fold = num_g / fold cm_all = np.zeros((len(self.mapping), len(self.mapping))) # tot_far = 0 # tot_tpr = 0 for i in range(fold): start_idx = int(i * 
num_g_per_fold) end_idx = int((i + 1) * num_g_per_fold) print('* [app][test] Test from {} to {} (total={})'.format( start_idx, end_idx, end_idx - start_idx)) G = graphs[start_idx:end_idx] lbls = labels[start_idx:end_idx] acc, cm = self.run_test(G, lbls) # print('\t ~~ cm', cm) cm_all += cm - np.array([[1, 0], [0, 1]]) # if cm.shape[0] == 2: # tot_far += cm[lbl_bng][lbl_mal] print(' >> [app][run_test] All FOLD: cm_all', cm_all) if len(self.mapping) == 2: labels_cpu = labels.cpu() lbl_mal = self.mapping['malware'] lbl_bng = self.mapping['benign'] n_mal = (labels_cpu == lbl_mal).sum().item() n_bgn = (labels_cpu == lbl_bng).sum().item() tpr = (cm_all[lbl_mal][lbl_mal] / n_mal * 100).item( ) # actual malware that is correctly detected as malware far = (cm_all[lbl_bng][lbl_mal] / n_bgn * 100).item() # benign that is incorrectly labeled as malware print(' >> [app][run_test] All FOLD: TPR', tpr, 'n_mal', n_mal, ' || FAR', far, 'n_bgn', n_bgn) total_samples = len(labels) total_correct = cm_all[lbl_mal][lbl_mal] + cm_all[lbl_bng][lbl_bng] acc_all = (total_correct / total_samples * 100).item() print(' >> [app][run_test] All FOLD: Acc', acc_all, ' Total samples', total_samples) def run_test(self, graphs, labels): batches = dgl.batch(graphs) acc, _, logits = self.model.eval_graph_classification(labels, batches) _, indices = torch.max(logits, dim=1) labels_cpu = labels.cpu() indices_cpu = indices.cpu() # print('\t [run_test] labels', labels) # print('\t [run_test] indices', indices) # labels_txt = ['malware', 'benign'] # print('\t [app][run_test] Total samples', len(labels_cpu)) # prepend this to make sure cm shape is always (2,2) labels_cpu = torch.cat((labels_cpu, torch.tensor([0, 1])), 0) indices_cpu = torch.cat((indices_cpu, torch.tensor([0, 1])), 0) cm = confusion_matrix(y_true=labels_cpu, y_pred=indices_cpu) C = cm / cm.astype(np.float).sum(axis=1) # print('\t [app][run_test] confusion_matrix:', cm) # if len(self.mapping) == 2: # lbl_mal = self.mapping['malware'] # lbl_bng = self.mapping['benign'] # n_mal = (labels_cpu == lbl_mal).sum().item() # n_bgn = (labels_cpu == lbl_bng).sum().item() # tpr = cm[lbl_mal][lbl_mal]/n_mal * 100 # actual malware that is correctly detected as malware # far = cm[lbl_bng][lbl_mal]/n_bgn * 100 # benign that is incorrectly labeled as malware # print('\t [app][run_test] TPR', tpr, ' || FAR', far, 'n_bgn', n_bgn) # # print('\t [app][run_test] FAR', far, 'n_bgn', n_bgn) # fig = plt.figure() # ax = fig.add_subplot(111) # cax = ax.matshow(cm) # plt.title('Confusion matrix of the classifier') # fig.colorbar(cax) # # ax.set_xticklabels([''] + labels) # # ax.set_yticklabels([''] + labels) # plt.xlabel('Predicted') # plt.ylabel('True') # plt.show() print("\t [app][run_test] Accuracy {:.4f}".format(acc)) # acc = np.mean(self.accuracies) return acc, cm
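# run_test() pads labels and predictions with one sample of each class so that
# confusion_matrix() always returns a (2, 2) array even when a slice of the data
# contains a single class, and run_test_fold() subtracts the identity matrix to
# cancel those two dummy hits before accumulating. A small standalone example of
# the trick (numpy and scikit-learn only):
import numpy as np
from sklearn.metrics import confusion_matrix

# A fold that happens to contain only benign samples (class 0); without padding,
# confusion_matrix would collapse to shape (1, 1) and the per-class indexing breaks.
y_true = np.array([0, 0, 0])
y_pred = np.array([0, 0, 1])

y_true_padded = np.concatenate([y_true, [0, 1]])
y_pred_padded = np.concatenate([y_pred, [0, 1]])
cm = confusion_matrix(y_true=y_true_padded, y_pred=y_pred_padded)  # shape (2, 2)

cm_unpadded = cm - np.eye(2, dtype=int)  # remove the two artificial correct hits
print(cm_unpadded)  # [[2 1]
                    #  [0 0]]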
def _train_net(subject, model, train_loader, val_loader, loss_function, optimizer, scheduler=None, epochs=500, early_stopping=True, plot=True, track_lr=True, pbar=None): """ Main training loop Parameters: - subject: Integer, subject ID - model: t.nn.Module (is set to training mode) - train_loader: t.utils.data.DataLoader: training data - val_loader: t.utils.data.DataLoader: validation data - loss_function: function - optimizer: t.optim.Optimizer - scheduler: t.optim.lr_scheduler or None - epochs: Integer, number of epochs to train - early_stopping: boolean, if True, store models for all epochs and select the one with the highest validation accuracy - plot: boolean, if True, generate all plots and store on disk - pbar: tqdm progress bar or None, in which case no progress will be displayed (not closed afterwards) Returns: (model, metrics, epoch, history) - model: t.nn.Module, trained model - metrics: t.tensor, size=[1, 4], accuracy, precision, recall, f1 - epoch: integer, always equal to 500 if early stopping is not used - history: tuple: (loss, accuracy), where both are t.tensor, size=[2, epochs] Notes: - Model and data will not be moved to gpu, do this outside of this function. - When early_stopping is enabled, this function will store all intermediate models """ # prepare result loss = t.zeros((2, epochs)) accuracy = t.zeros((2, epochs)) lr = None if track_lr: lr = t.zeros((epochs)) # prepare early_stopping if early_stopping: early_stopping = EarlyStopping() use_cuda = model.is_cuda() # train model for all epochs for epoch in range(epochs): # train the model train_loss, train_accuracy = _train_epoch(model, train_loader, loss_function, optimizer, scheduler=scheduler, use_cuda=use_cuda) # collect current loss and accuracy validation_loss, validation_accuracy = _test_net(model, val_loader, loss_function, train=False, use_cuda=use_cuda) loss[0, epoch] = train_loss loss[1, epoch] = validation_loss accuracy[0, epoch] = train_accuracy accuracy[1, epoch] = validation_accuracy if track_lr: lr[epoch] = optimizer.param_groups[0]['lr'] # do early stopping if early_stopping: early_stopping.checkpoint(model, loss[1, epoch], accuracy[1, epoch], epoch) if pbar is not None: pbar.update() # get the best model if early_stopping: model, best_loss, best_accuracy, best_epoch = early_stopping.use_best_model( model) else: best_epoch = epoch # generate plots if plot: generate_plots(subject, model, val_loader, loss, accuracy, lr=lr) metrics = get_metrics_from_model(model, val_loader) return model, metrics, best_epoch + 1, (loss, accuracy)
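# _train_net() expects an EarlyStopping with checkpoint(model, loss, accuracy, epoch)
# and use_best_model(model): state is recorded every epoch and the epoch with the
# highest validation accuracy is restored afterwards. The docstring above says all
# intermediate models are stored; keeping only the best state_dict in memory is a
# simplifying assumption of this sketch, not the original implementation.
import copy


class EarlyStopping:
    """Keep the model state of the best-accuracy epoch (illustrative sketch)."""

    def __init__(self):
        self.best_loss = float('inf')
        self.best_accuracy = float('-inf')
        self.best_epoch = -1
        self.best_state = None

    def checkpoint(self, model, loss, accuracy, epoch):
        # record this epoch if its validation accuracy is the best seen so far
        if float(accuracy) > self.best_accuracy:
            self.best_loss = float(loss)
            self.best_accuracy = float(accuracy)
            self.best_epoch = epoch
            self.best_state = copy.deepcopy(model.state_dict())

    def use_best_model(self, model):
        """Restore the best weights and return (model, loss, accuracy, epoch)."""
        if self.best_state is not None:
            model.load_state_dict(self.best_state)
        return model, self.best_loss, self.best_accuracy, self.best_epoch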
def train(self, data_train, data_valid, enable_es=1):
    with tf.Session(graph=self.graph) as session:
        tf.set_random_seed(1234)
        logger = Logger(session, self.summary_dir)
        # here you initialize the tensorflow saver that will be used in saving the checkpoints.
        # max_to_keep: defaults to keeping the 5 most recent checkpoints of your model
        saver = tf.train.Saver()
        self.session = session
        early_stopping = EarlyStopping(name='total loss', decay_fn=self.decay_fn)

        if (self.restore and self.load(session, saver)):
            num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(session)
            print('EPOCHS trained: ', num_epochs_trained)
        else:
            print('Initializing Variables ...')
            tf.global_variables_initializer().run()

        if (self.model_graph.cur_epoch_tensor.eval(session) == self.epochs):
            return

        for cur_epoch in range(self.model_graph.cur_epoch_tensor.eval(session), self.epochs + 1, 1):
            print('EPOCH: ', cur_epoch)
            self.current_epoch = cur_epoch

            loss_tr, recons_tr, L2_loss = self.train_epoch(session, logger, data_train)
            if np.isnan(loss_tr):
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                print('Recons: ', recons_tr)
                sys.exit()

            loss_val, recons_val = self.valid_epoch(session, logger, data_valid)

            print('TRAIN | AE Loss: ', loss_tr, ' | Recons: ', recons_tr, ' | L2_loss: ', L2_loss)
            print('VALID | AE Loss: ', loss_val, ' | Recons: ', recons_val)

            if (cur_epoch == 1) or ((cur_epoch % const.SAVE_EPOCH == 0) and (cur_epoch != 0)):
                self.save(session, saver, self.model_graph.global_step_tensor.eval(session))
                if self.plot:
                    self.generate_samples(data_train, session, cur_epoch)
                if self.clustering:
                    self.generate_clusters(logger, cur_epoch, data_train, data_valid)

            session.run(self.model_graph.increment_cur_epoch_tensor)

            # Early stopping
            if (enable_es == 1 and early_stopping.stop(loss_val)):
                print('Early Stopping!')
                break

            if cur_epoch % 50 == 0:
                if self.colab:
                    self.push_colab()

        self.save(session, saver, self.model_graph.global_step_tensor.eval(session))
        if self.plot:
            self.generate_samples(data_train, session, cur_epoch)
        if self.clustering:
            self.generate_clusters(logger, cur_epoch, data_train, data_valid)
        if self.colab:
            self.push_colab()
        return
def sent_clf(dataset, config, opts, transfer=False): from logger.experiment import Experiment opts.name = config["name"] X_train, y_train, _, X_val, y_val, _ = dataset vocab = None if transfer: opts.transfer = config["pretrained_lm"] checkpoint = load_checkpoint(opts.transfer) config["vocab"].update(checkpoint["config"]["vocab"]) dict_pattern_rename(checkpoint["config"]["model"], {"rnn_": "bottom_rnn_"}) config["model"].update(checkpoint["config"]["model"]) vocab = checkpoint["vocab"] #################################################################### # Load Preprocessed Datasets #################################################################### if config["preprocessor"] == "twitter": preprocessor = twitter_preprocessor() else: preprocessor = None print("Building training dataset...") train_set = ClfDataset(X_train, y_train, vocab=vocab, preprocess=preprocessor, vocab_size=config["vocab"]["size"], seq_len=config["data"]["seq_len"]) print("Building validation dataset...") val_set = ClfDataset(X_val, y_val, seq_len=train_set.seq_len, preprocess=preprocessor, vocab=train_set.vocab) src_lengths = [len(x) for x in train_set.data] val_lengths = [len(x) for x in val_set.data] # select sampler & dataloader train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True) val_sampler = SortedSampler(val_lengths) val_sampler_train = SortedSampler(src_lengths) train_loader = DataLoader(train_set, batch_sampler=train_sampler, num_workers=opts.cores, collate_fn=ClfCollate()) val_loader = DataLoader(val_set, sampler=val_sampler, batch_size=config["batch_size"], num_workers=opts.cores, collate_fn=ClfCollate()) val_loader_train_dataset = DataLoader(train_set, sampler=val_sampler_train, batch_size=config["batch_size"], num_workers=opts.cores, collate_fn=ClfCollate()) #################################################################### # Model #################################################################### ntokens = len(train_set.vocab) model = Classifier(ntokens, len(set(train_set.labels)), **config["model"]) model.to(opts.device) clf_criterion = nn.CrossEntropyLoss() lm_criterion = nn.CrossEntropyLoss(ignore_index=0) embed_parameters = filter(lambda p: p.requires_grad, model.embed.parameters()) bottom_parameters = filter( lambda p: p.requires_grad, chain(model.bottom_rnn.parameters(), model.vocab.parameters())) if config["model"]["has_att"]: top_parameters = filter( lambda p: p.requires_grad, chain(model.top_rnn.parameters(), model.attention.parameters(), model.classes.parameters())) else: top_parameters = filter( lambda p: p.requires_grad, chain(model.top_rnn.parameters(), model.classes.parameters())) embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001) rnn_optimizer = optim.ASGD(bottom_parameters) top_optimizer = Adam(top_parameters, lr=config["top_lr"]) #################################################################### # Training Pipeline #################################################################### # Trainer: responsible for managing the training process trainer = SentClfTrainer(model, train_loader, val_loader, (lm_criterion, clf_criterion), [embed_optimizer, rnn_optimizer, top_optimizer], config, opts.device, valid_loader_train_set=val_loader_train_dataset, unfreeze_embed=config["unfreeze_embed"], unfreeze_rnn=config["unfreeze_rnn"]) #################################################################### # Experiment: logging and visualizing the training process #################################################################### # exp = Experiment(opts.name, config, 
src_dirs=opts.source, # output_dir=EXP_DIR) # exp.add_metric("ep_loss_lm", "line", "epoch loss lm", # ["TRAIN", "VAL"]) # exp.add_metric("ep_loss_cls", "line", "epoch loss class", # ["TRAIN", "VAL"]) # exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"]) # exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"]) # # exp.add_value("epoch", title="epoch summary") # exp.add_value("progress", title="training progress") ep_loss_lm = [10000, 10000] ep_loss_cls = [10000, 10000] ep_f1 = [0, 0] ep_acc = [0, 0] e_log = 0 progress = 0 #################################################################### # Resume Training from a previous checkpoint #################################################################### if transfer: print("Transferring Encoder weights ...") dict_pattern_rename(checkpoint["model"], { "encoder": "bottom_rnn", "decoder": "vocab" }) load_state_dict_subset(model, checkpoint["model"]) print(model) #################################################################### # Training Loop #################################################################### best_loss = None early_stopping = EarlyStopping("min", config["patience"]) for epoch in range(0, config["epochs"]): train_loss = trainer.train_epoch() val_loss, y, y_pred = trainer.eval_epoch(val_set=True) _, y_train, y_pred_train = trainer.eval_epoch(train_set=True) # exp.update_metric("ep_loss_lm", train_loss[0], "TRAIN") ep_loss_lm[0] = train_loss[0] # exp.update_metric("ep_loss_lm", val_loss[0], "VAL") ep_loss_lm[1] = val_loss[0] # exp.update_metric("ep_loss_cls", train_loss[1], "TRAIN") # exp.update_metric("ep_loss_cls", val_loss[1], "VAL") ep_loss_cls[0] = train_loss[1] ep_loss_cls[1] = val_loss[1] # exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train), # "TRAIN") ep_f1[0] = f1_macro(y_train, y_pred_train) # exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL") ep_f1[1] = f1_macro(y, y_pred) # exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN") # exp.update_metric("ep_acc", acc(y, y_pred), "VAL") ep_acc[0] = acc(y_train, y_pred_train) ep_acc[1] = acc(y, y_pred) # print('Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format( # ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1], ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1] # )) # epoch_log = exp.log_metrics(["ep_loss_lm", "ep_loss_cls","ep_f1", "ep_acc"]) epoch_log = 'Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format( ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1], ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1]) print(epoch_log) # exp.update_value("epoch", epoch_log) e_log = epoch_log # print('') # Save the model if the val loss is the best we've seen so far. # if not best_loss or val_loss[1] < best_loss: # best_loss = val_loss[1] # trainer.best_acc = acc(y, y_pred) # trainer.best_f1 = f1_macro(y, y_pred) # trainer.checkpoint(name=opts.name, timestamp=True) best_loss = val_loss[1] trainer.best_acc = acc(y, y_pred) trainer.best_f1 = f1_macro(y, y_pred) trainer.checkpoint(name=opts.name, tags=str(epoch)) # if early_stopping.stop(val_loss[1]): # print("Early Stopping (according to classification loss)....") # break print("\n" * 2) return best_loss, trainer.best_acc, trainer.best_f1
def train(params, m, datas): # early stopping es = EarlyStopping(mode = 'max', patience = params.cldc_patience) # set optimizer optimizer = get_optimizer(params, m) # get initial parameters if params.zs_reg_alpha > 0: init_param_dict = {k: v.detach().clone() for k, v in m.named_parameters() if v.requires_grad} # training train_lang, train_data = get_lang_data(params, datas, training = True) # dev & test are in the same lang test_lang, test_data = get_lang_data(params, datas) n_batch = train_data.train_size // params.cldc_bs if train_data.train_size % params.cldc_bs == 0 else train_data.train_size // params.cldc_bs + 1 # get the same n_batch for unlabelled data as well # batch size for unlabelled data rest_cldc_bs = train_data.rest_train_size // n_batch # per category data_idxs = [list(range(len(train_idx))) for train_idx in train_data.train_idxs] rest_data_idxs = list(range(len(train_data.rest_train_idxs))) # number of iterations cur_it = 0 # write to tensorboard writer = SummaryWriter('./history/{}'.format(params.log_path)) if params.write_tfboard else None # best dev/test bdev = 0 btest = 0 # current dev/test cdev = 0 ctest = 0 dev_class_acc = {} test_class_acc = {} dev_cm = None test_cm = None # early stopping warm up flag, start es after train loss below some threshold es_flag = False # set io function out_semicldc = getattr(ios, 'out_semicldc_{}'.format(params.cldc_train_mode)) for i in range(params.cldc_ep): for data_idx in data_idxs: shuffle(data_idx) shuffle(rest_data_idxs) for j in range(n_batch): train_idxs = [] for k, data_idx in enumerate(data_idxs): if j < n_batch - 1: train_idxs.append(data_idx[int(j * params.cldc_bs * train_data.train_prop[k]): int((j + 1) * params.cldc_bs * train_data.train_prop[k])]) rest_train_idxs = rest_data_idxs[j * rest_cldc_bs: (j + 1) * rest_cldc_bs] elif j == n_batch - 1: train_idxs.append(data_idx[int(j * params.cldc_bs * train_data.train_prop[k]):]) rest_train_idxs = rest_data_idxs[j * rest_cldc_bs:] # get batch data batch_train, batch_train_lens, batch_train_lb, batch_train_ohlb = get_batch(params, train_idxs, train_data.train_idxs, train_data.train_lens) batch_rest_train, batch_rest_train_lens, batch_rest_train_lb, batch_rest_train_ohlb = get_rest_batch(params, rest_train_idxs, train_data.rest_train_idxs, train_data.rest_train_lens, enumerate_discrete) optimizer.zero_grad() m.train() if i + 1 <= params.cldc_warm_up_ep: m.warm_up = True else: m.warm_up = False loss_dict, batch_pred = m(train_lang, batch_train, batch_train_lens, batch_train_lb, batch_train_ohlb, batch_rest_train, batch_rest_train_lens, batch_rest_train_lb, batch_rest_train_ohlb) # regularization term if params.zs_reg_alpha > 0: reg_loss = .0 for k, v in m.named_parameters(): if k in init_param_dict and v.requires_grad: reg_loss += torch.sum((v - init_param_dict[k]) ** 2) print(reg_loss.detach()) reg_loss *= params.zs_reg_alpha / 2 reg_loss.backward() batch_acc, batch_acc_cls = get_classification_report(params, batch_train_lb.data.cpu().numpy(), batch_pred.data.cpu().numpy()) if loss_dict['L_cldc_loss'] < params.cldc_lossth: es_flag = True #loss_dict['total_loss'].backward() out_semicldc(i, j, n_batch, loss_dict, batch_acc, batch_acc_cls, bdev, btest, cdev, ctest, es.num_bad_epochs) #torch.nn.utils.clip_grad_norm_(filter(lambda p: p.grad is not None and p.requires_grad, m.parameters()), 5) ''' # debug for gradient for p_name, p in m.named_parameters(): if p.grad is not None and p.requires_grad: print(p_name, p.grad.data.norm(2).item()) ''' optimizer.step() cur_it += 1 
update_tensorboard(params, writer, loss_dict, batch_acc, cdev, ctest, dev_class_acc, test_class_acc, cur_it) if cur_it % params.CLDC_VAL_EVERY == 0: sys.stdout.write('\n') sys.stdout.flush() # validation cdev, dev_class_acc, dev_cm = test(params, m, test_data.dev_idxs, test_data.dev_lens, test_data.dev_size, test_data.dev_prop, test_lang, cm = True) ctest, test_class_acc, test_cm = test(params, m, test_data.test_idxs, test_data.test_lens, test_data.test_size, test_data.test_prop, test_lang, cm = True) print(dev_cm) print(test_cm) if es.step(cdev): print('\nEarly Stopped.') # vis #if params.cldc_visualize: #tsne2d(params, m) # vis return elif es.is_better(cdev, bdev): bdev = cdev btest = ctest #save_model(params, m) # reset bad epochs if not es_flag: es.num_bad_epochs = 0
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO
    vocab_size = VOCAB_SIZE
    print('NUM of VOCAB' + str(vocab_size))

    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, num_labels,
                                    BATCH_SIZE, att_mode=opt.attention, soft_last=False,
                                    use_glove=USE_GLOVE, add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)
    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()
        # print('Using ce loss')
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    es = EarlyStopping(patience=PATIENCE)
    old_model = None

    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in tqdm(enumerate(train_loader),
                                              total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()
            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
            y_pred = model(data.cuda(), seq_len, elmo_emb, emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():
                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
                y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) +
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'bw') as f:
        torch.save(model.state_dict(), f)

    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
            emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
            y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
            pred_list.append(y_pred.data.cpu().numpy())  # x[np.where( x > 3.0 )]
            del y_pred
    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
    return pred_list
def one_fold(num_fold, train_index, dev_index):
    print("Training on fold:", num_fold)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # construct data loader
    train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
    train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True)
    dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
    dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)

    gradient_accumulation_steps = 1
    num_train_steps = int(len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps * MAX_EPOCH)

    pred_list_test_best = None
    final_pred_best = None
    # This is to prevent model divergence; once it happens, retrain
    while True:
        is_diverged = False
        model = BERT_classifer.from_pretrained(BERT_MODEL)
        model.add_output_layer(BERT_MODEL, NUM_EMO)
        model = nn.DataParallel(model)
        model.cuda()

        # BERT optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=0.1,
                             t_total=num_train_steps)

        if opt.w == 1:
            weight_list = [0.3, 0.3, 0.3, 1.7]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        elif opt.w == 2:
            weight_list = [0.3198680179, 0.246494733, 0.2484349259, 1.74527696]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        weight_list = [x ** FLAT for x in weight_list]
        weight_label = torch.Tensor(weight_list).cuda()
        weight_list_binary = [x ** FLAT for x in weight_list_binary]
        weight_binary = torch.Tensor(weight_list_binary).cuda()
        print('binary loss reweight =', weight_list_binary)

        # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)
        if opt.loss == 'focal':
            loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
            loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False)
        elif opt.loss == 'ce':
            loss_criterion = nn.CrossEntropyLoss(reduce=False)
            loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)
        loss_criterion_emo_only = nn.MSELoss()

        # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
        es = EarlyStopping(patience=EARLY_STOP_PATIENCE)

        final_pred_best = None
        final_pred_list_test = None
        pred_list_test = None

        for num_epoch in range(MAX_EPOCH):
            print('Begin training epoch:', num_epoch)
            sys.stdout.flush()
            train_loss = 0
            model.train()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in tqdm(
                    enumerate(train_data_loader), total=len(train_data_set) / BATCH_SIZE):
                optimizer.zero_grad()
                if USE_TOKEN_TYPE:
                    pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                else:
                    pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                    e_c.view(-1).shape[0]
                loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                           loss_binary) / e_c.view(-1).shape[0]
                loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                # training trilogy
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                train_loss += loss.data.cpu().numpy() * tokens.shape[0]
                del loss, pred

            # Evaluate
            model.eval()
            dev_loss = 0
            # pred_list = []
            # gold_list = []
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(dev_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())
                    loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                        e_c.view(-1).shape[0]
                    loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                               loss_binary) / e_c.view(-1).shape[0]
                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                    loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)
                    dev_loss += loss.data.cpu().numpy() * tokens.shape[0]
                    # pred_list.append(pred.data.cpu().numpy())
                    # gold_list.append(e_c.numpy())
                    del pred, loss

            # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
            # gold_list = np.concatenate(gold_list, axis=0)
            print('Training loss:', train_loss / len(train_data_set), end='\t')
            print('Dev loss:', dev_loss / len(dev_data_set))
            # print(classification_report(gold_list, pred_list, target_names=EMOS))
            # get_metrics(pred_list, gold_list)

            # checking divergence
            if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                print("Model diverged, retry")
                is_diverged = True
                break

            if es.step(dev_loss):  # overfitting
                print('overfitting, loading best model ...')
                if num_epoch == 1:
                    is_diverged = True
                    final_pred_best = deepcopy(final_pred_list_test)
                    pred_list_test_best = deepcopy(pred_list_test)
                break
            else:
                if es.is_best():
                    print('saving best model ...')
                    if final_pred_best is not None:
                        del final_pred_best
                    final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is not None:
                        del pred_list_test_best
                    pred_list_test_best = deepcopy(pred_list_test)
                else:
                    print('not best model, ignoring ...')
                    if final_pred_best is None:
                        final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is None:
                        pred_list_test_best = deepcopy(pred_list_test)

            print('Gold Dev ...')
            pred_list_test = []
            model.eval()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_dev_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, _, _ = model(tokens.cuda(), masks.cuda())
                    pred_list_test.append(pred.data.cpu().numpy())
            pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

            print('Gold Test ...')
            final_pred_list_test = []
            model.eval()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_test_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, _, _ = model(tokens.cuda(), masks.cuda())
                    final_pred_list_test.append(pred.data.cpu().numpy())
            final_pred_list_test = np.argmax(np.concatenate(final_pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

        if is_diverged:
            print("Reinitialize model ...")
            del model
            continue

        all_fold_results.append(pred_list_test_best)
        real_test_results.append(final_pred_best)
        del model
        break
def one_fold(num_fold, train_index, dev_index):
    print("Training on fold:", num_fold)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # construct data loader
    # for one fold, test data comes from the k-fold split.
    train_data_set = TrainDataSet(X_train, y_train, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True)
    dev_data_set = TrainDataSet(X_dev, y_dev, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True)
    dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    final_pred_best = None
    # This is to prevent model divergence; once it happens, retrain
    while True:
        is_diverged = False
        # Model is defined in HierarchicalPredictor
        model = HierarchicalAttPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, CTX_LSTM_DIM,
                                         num_of_vocab, SENT_PAD_LEN, id2word,
                                         USE_ELMO=True, ADD_LINEAR=False)
        model.load_embedding(emb)
        model.deepmoji_model.load_specific_weights(PRETRAINED_PATH, exclude_names=['output_layer'])
        model.cuda()
        # model = nn.DataParallel(model)
        # model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)
        # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

        # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)
        if opt.loss == 'focal':
            loss_criterion = FocalLoss(gamma=opt.focal)
        elif opt.loss == 'ce':
            loss_criterion = nn.BCELoss()

        es = EarlyStopping(patience=EARLY_STOP_PATIENCE)

        final_pred_list_test = None
        result_print = {}
        for num_epoch in range(MAX_EPOCH):
            # to ensure shuffling at every epoch
            train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True)

            print('Begin training epoch:', num_epoch, end='...\t')
            sys.stdout.flush()
            # stepping scheduler
            scheduler.step(num_epoch)
            print('Current learning rate', scheduler.get_lr())

            # Training step
            train_loss = 0
            model.train()
            for i, (a, a_len, emoji_a, e_c) in tqdm(enumerate(train_data_loader),
                                                    total=len(train_data_set) / BATCH_SIZE):
                optimizer.zero_grad()
                e_c = e_c.type(torch.float)
                pred = model(a.cuda(), a_len, emoji_a.cuda())
                loss_label = loss_criterion(pred.squeeze(1), e_c.view(-1).cuda()).cuda()
                # training trilogy
                loss_label.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                train_loss += loss_label.data.cpu().numpy() * a.shape[0]
                del pred, loss_label

            # Evaluation step
            model.eval()
            dev_loss = 0
            # pred_list = []
            for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader):
                with torch.no_grad():
                    e_c = e_c.type(torch.float)
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    loss_label = loss_criterion(pred.squeeze(1), e_c.view(-1).cuda()).cuda()
                    dev_loss += loss_label.data.cpu().numpy() * a.shape[0]
                    # pred_list.append(pred.data.cpu().numpy())
                    # gold_list.append(e_c.numpy())
                    del pred, loss_label

            print('Training loss:', train_loss / len(train_data_set), end='\t')
            print('Dev loss:', dev_loss / len(dev_data_set))
            # print(classification_report(gold_list, pred_list, target_names=EMOS))
            # get_metrics(pred_list, gold_list)

            # Gold test testing
            print('Final test testing...')
            final_pred_list_test = []
            model.eval()
            for i, (a, a_len, emoji_a) in enumerate(final_test_data_loader):
                with torch.no_grad():
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    final_pred_list_test.append(pred.data.cpu().numpy())
                    del a, pred
            print("final_pred_list_test", len(final_pred_list_test))
            final_pred_list_test = np.concatenate(final_pred_list_test, axis=0)
            final_pred_list_test = np.squeeze(final_pred_list_test, axis=1)
            print("final_pred_list_test_concat", len(final_pred_list_test))

            accuracy, precision, recall, f1 = get_metrics(np.asarray(final_test_target_list),
                                                          np.asarray(final_pred_list_test))
            result_print.update({num_epoch: [accuracy, precision, recall, f1]})

            if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                print("Model diverged, retry")
                is_diverged = True
                break

            if es.step(dev_loss):  # overfitting
                print('overfitting, loading best model ...')
                break
            else:
                if es.is_best():
                    print('saving best model ...')
                    if final_pred_best is not None:
                        del final_pred_best
                    final_pred_best = deepcopy(final_pred_list_test)
                else:
                    print('not best model, ignoring ...')
                    if final_pred_best is None:
                        final_pred_best = deepcopy(final_pred_list_test)

        with open(result_path, 'wb') as w:
            pkl.dump(result_print, w)

        if is_diverged:
            print("Reinitialize model ...")
            del model
            continue

        real_test_results.append(np.asarray(final_pred_best))
        # saving model for inference
        torch.save(model.state_dict(), opt.out_path)
        del model
        break
def train(pairs_batch_train, pairs_batch_dev, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, batch_size, num_epochs, device):
    clip = 5.0
    tf_rate = 1
    early_stopping = EarlyStopping(patience=15, verbose=False, delta=0)

    for epoch in range(10):
        encoder.train()
        decoder.train()
        for _, batch in enumerate(pairs_batch_train):
            pad_input_seqs, input_seq_lengths, pad_target_seqs, pad_target_seqs_lengths = batch
            pad_input_seqs, pad_target_seqs = pad_input_seqs.to(device), pad_target_seqs.to(device)

            train_loss = 0
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_output, encoder_hidden = encoder(pad_input_seqs, input_seq_lengths)

            decoder_input = torch.ones(batch_size, 1).long().to(device)
            decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                              encoder_hidden[1].sum(0, keepdim=True))

            teacher_forcing = True if random.random() <= tf_rate else False

            if teacher_forcing:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = pad_target_seqs[i]
            else:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                    _, topi = decoder_output.topk(1)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = topi.detach()

            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
            encoder_optimizer.step()
            decoder_optimizer.step()

        # CALCULATE EVALUATION
        with torch.no_grad():
            for _, batch in enumerate(pairs_batch_dev):
                encoder.eval()
                decoder.eval()
                pad_input_seqs, input_seq_lengths, pad_target_seqs, pad_target_seqs_lengths = batch
                pad_input_seqs, pad_target_seqs = pad_input_seqs.to(device), pad_target_seqs.to(device)

                dev_loss = 0
                encoder_output, encoder_hidden = encoder(pad_input_seqs, input_seq_lengths)

                decoder_input = torch.ones(batch_size, 1).long().to(device)
                decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                                  encoder_hidden[1].sum(0, keepdim=True))

                teacher_forcing = True if random.random() <= tf_rate else False

                if teacher_forcing:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = pad_target_seqs[i]
                else:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                        _, topi = decoder_output.topk(1)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = topi.detach()

        # early_stopping(complete_loss_dev, (encoder, decoder, encoder_optimizer, decoder_optimizer))
        # if early_stopping.early_stop:
        #     print('Early stopping')
        #     break

        print('[Epoch: %d] train_loss: %.4f val_loss: %.4f' %
              (epoch + 1, train_loss.item(), dev_loss.item()))
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    train_set = TrainDataReader(X_train, y_train, MAX_LEN_DATA)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    dev_set = TrainDataReader(X_dev, y_dev, MAX_LEN_DATA)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE * 3, shuffle=False)
    test_set = TestDataReader(X_test, MAX_LEN_DATA)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE * 3, shuffle=False)

    # Model initialize
    model = BinaryLSTMClassifier(
        emb_dim=SRC_EMB_DIM,
        vocab_size=glove_tokenizer.get_vocab_size(),
        num_label=NUM_EMO,
        hidden_dim=SRC_HIDDEN_DIM,
        attention_mode=ATTENTION,
        args=args
    )

    if args.fix_emb:
        para_group = [
            {'params': [p for n, p in model.named_parameters()
                        if n.startswith("encoder") and not 'encoder.embeddings' in n],
             'lr': args.en_lr},
            {'params': [p for n, p in model.named_parameters() if n.startswith("decoder")],
             'lr': args.de_lr}]
    else:
        para_group = [
            {'params': [p for n, p in model.named_parameters() if n.startswith("encoder")],
             'lr': args.en_lr},
            {'params': [p for n, p in model.named_parameters() if n.startswith("decoder")],
             'lr': args.de_lr}]

    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(para_group)

    if args.scheduler:
        epoch_to_step = int(len(train_set) / BATCH_SIZE)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=WARMUP_EPOCH * epoch_to_step,
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio
        )
    if args.glorot_init:
        logger('use glorot initialization')
        for group in para_group:
            nn_utils.glorot_init(group['params'])
    if args.huang_init:
        nn_utils.huang_init(model.named_parameters(), uniform=not args.normal_init)

    model.load_encoder_embedding(glove_tokenizer.get_embeddings(), fix_emb=args.fix_emb)
    model.cuda()

    # Start training
    EVAL_EVERY = int(len(train_set) / BATCH_SIZE / 4)
    best_model = None
    es = EarlyStopping(patience=PATIENCE)
    update_step = 0
    exit_training = False
    for epoch in range(1, MAX_EPOCH + 1):
        train_pred = []
        train_gold_list = []
        logger('Training on epoch=%d -------------------------' % (epoch))
        train_loss_sum = 0
        # print('Current encoder learning rate', scheduler.get_lr())
        # print('Current decoder learning rate', scheduler.get_lr())
        for i, (src, src_len, trg) in tqdm(enumerate(train_loader),
                                           total=int(len(train_set) / BATCH_SIZE)):
            model.train()
            update_step += 1
            # print('i=%d: ' % (i))
            # trg = torch.index_select(trg, 1, torch.LongTensor(list(range(1, len(EMOS)+1))))
            if args.scheduler:
                scheduler.step()
            optimizer.zero_grad()
            decoder_logit = model(src.cuda(), src_len.cuda())
            train_pred.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            gold = np.asarray(trg)
            trg_index = []
            for i in range(gold.shape[0]):
                train_gold_list.append(gold[i])
            loss = loss_criterion(decoder_logit, trg.view(-1).cuda())
            loss.backward()
            train_loss_sum += loss.data.cpu().numpy() * src.shape[0]
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()

            if update_step % EVAL_EVERY == 0 and args.eval_every is not None:
                model, best_model, exit_training = eval(model, best_model, loss_criterion,
                                                        es, dev_loader, dev_set, y_dev)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:", train_loss_sum / len(train_set))
        if not train_pred == []:
            print('TRAIN---------: ')
            train_pred = np.concatenate(train_pred, axis=0)
            train_gold_list = np.array(train_gold_list)
            show_classification_report(train_gold_list, train_pred)
        # model, best_model, exit_training = eval(model, best_model, loss_criterion, es, dev_loader, dev_set)
        if exit_training:
            break

    # final testing
    model.eval()
    preds = []
    logger("Testing:")
    for i, (src, src_len) in tqdm(enumerate(test_loader), total=int(len(test_set) / BATCH_SIZE)):
        with torch.no_grad():
            decoder_logit = model(src.cuda(), src_len.cuda())
            preds.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            del decoder_logit

    preds = np.concatenate(preds, axis=0)
    gold = np.asarray(y_test)
    # preds = np.argmax(preds, axis=-1)
    logger("NOTE, this is on the test set")
    # metric = get_metrics(gold, preds)
    # logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_multi_metrics(binary_gold, binary_preds)
    # logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # show_classification_report(binary_gold, binary_preds)
    # logger('Jaccard:', jaccard_score(gold, preds))
    return gold, preds, model
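The epoch loop above delegates validation and best-model bookkeeping to an eval(...) helper that is not included in this excerpt. The sketch below is only a guess at the contract that call site appears to follow (compute a mean dev loss, update the early stopper, keep or restore a best copy); the real helper may differ, and the imports mirror what the surrounding snippets already use.

import torch
from copy import deepcopy

def eval_sketch(model, best_model, loss_criterion, es, dev_loader, dev_set, y_dev):
    # compute the mean dev loss for the current weights
    model.eval()
    dev_loss = 0.0
    with torch.no_grad():
        for src, src_len, trg in dev_loader:
            logit = model(src.cuda(), src_len.cuda())
            loss = loss_criterion(logit, trg.view(-1).cuda())
            dev_loss += loss.item() * src.shape[0]
    dev_loss /= len(dev_set)

    # early-stopping bookkeeping: remember the best copy, restore it when patience runs out
    exit_training = es.step(dev_loss)
    if es.is_best():
        best_model = deepcopy(model)
    elif exit_training and best_model is not None:
        model = best_model
    return model, best_model, exit_training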
def train():
    """ train """

    """ construct index-based data loader """
    idx = np.array([i for i in range(args.seq_len + 1, data_obj.num_times)])
    idx_dat = dat.TensorDataset(torch.tensor(idx, dtype=torch.int32))
    train_idx_data_loader = dat.DataLoader(dataset=idx_dat, batch_size=args.batch_size, shuffle=True)

    idx = np.array([i for i in range(args.seq_len + 1, data_obj.num_times)])
    idx_dat = dat.TensorDataset(torch.tensor(idx, dtype=torch.int32))
    test_idx_data_loader = dat.DataLoader(dataset=idx_dat, batch_size=1, shuffle=False)

    """ set writer, loss function, and optimizer """
    mse_loss_func = nn.MSELoss()
    mse_sum_loss_func = nn.MSELoss(reduction='sum')
    spatial_loss_func = SpatialLoss(sp_neighbor=args.sp_neighbor)
    temporal_loss_func = TemporalLoss(tp_neighbor=args.tp_neighbor)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    early_stopping = EarlyStopping(patience=args.patience, verbose=args.verbose)

    def construct_sequence_x(idx_list, dynamic_x, static_x):
        d_x = [dynamic_x[i - args.seq_len + 1: i + 1, ...] for i in idx_list]
        d_x = np.stack(d_x, axis=0)
        s_x = np.expand_dims(static_x, axis=0)
        s_x = np.repeat(s_x, args.seq_len, axis=1)  # shape: (t, c, h, w)
        s_x = np.repeat(s_x, len(idx_list), axis=0)  # shape: (b, t, c, h, w)
        x = np.concatenate([d_x, s_x], axis=2)
        return torch.tensor(x, dtype=torch.float).to(device)

    def construct_y(idx_list, output_y):
        y = [output_y[i] for i in idx_list]
        y = np.stack(y, axis=0)
        return torch.tensor(y, dtype=torch.float).to(device)

    """ training """
    for epoch in range(args.num_epochs):
        model.train()
        total_losses, train_losses, val_losses, l1_losses, ae_losses, sp_losses = 0, 0, 0, 0, 0, 0
        for _, idx in enumerate(train_idx_data_loader):
            batch_idx = idx[0]

            """ construct sequence input """
            batch_x = construct_sequence_x(batch_idx, data_obj.dynamic_x, data_obj.static_x)  # shape: (b, t, c, h, w)
            batch_y = construct_y(batch_idx, data_obj.train_y)  # shape: (b, 1, h, w)
            batch_val_y = construct_y(batch_idx, data_obj.val_y)

            """ start train """
            out, sparse_x, _, de_x, em = model(batch_x)
            train_loss = mse_loss_func(batch_y[~torch.isnan(batch_y)], out[~torch.isnan(batch_y)])
            train_losses += train_loss.item()

            """ add loss according to the model type """
            total_loss = train_loss
            if 'l1' in model_types:
                l1_loss = model.sparse_layer.l1_loss()
                l1_losses += l1_loss.item()
                total_loss += l1_loss * args.alpha
            if 'ae' in model_types:
                ae_loss = mse_sum_loss_func(sparse_x, de_x)
                ae_losses += ae_loss.item()
                total_loss += ae_loss * args.beta
            if 'sp' in model_types:
                sp_loss = spatial_loss_func(out)
                sp_losses += sp_loss.item()
                total_loss += sp_loss * args.gamma
            # if 'vg' in args.model_type:
            #     # 1-step temporal neighboring loss
            #     pre_batch_idx = batch_idx - torch.ones_like(batch_idx)
            #     pre_batch_x = construct_sequence_x(pre_batch_idx, data_obj.dynamic_x,
            #                                        data_obj.static_x)  # x = (b, t, c, h, w)
            #     _, _, _, _, pre_em = model(pre_batch_x)
            #     tp_loss = torch.mean(torch.mean((em - pre_em) ** 2, axis=1))
            #
            #     # 1-step spatial neighboring loss
            #     sp_loss = 0.
            #     sp_loss += torch.mean(torch.mean((em[..., 1:, 1:] - em[..., :-1, :-1]) ** 2, axis=1))
            #     sp_loss += torch.mean(torch.mean((em[..., 1:, :] - em[..., :-1, :]) ** 2, axis=1))
            #     sp_loss += torch.mean(torch.mean((em[..., :, 1:] - em[..., :, :-1]) ** 2, axis=1))
            #     alosses.append(tp_loss.item() + sp_loss.item())
            #     total_loss += (tp_loss + sp_loss) * args.eta

            total_losses += total_loss.item()
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            """ validate """
            val_loss = mse_loss_func(batch_val_y[~torch.isnan(batch_val_y)], out[~torch.isnan(batch_val_y)])
            val_losses += val_loss.item()

        if args.verbose:
            logging.info('Epoch [{}/{}] total_loss = {:.3f}, train_loss = {:.3f}, val_loss = {:.3f}, '
                         'l1_losses = {:.3f}, ae_losses = {:.3f}, sp_losses = {:.3f}.'
                         .format(epoch, args.num_epochs, total_losses, train_losses, val_losses,
                                 l1_losses, ae_losses, sp_losses))

        # write for tensorboard visualization
        if args.use_tb:
            tb_writer.add_scalar('data/train_loss', train_losses, epoch)
            tb_writer.add_scalar('data/val_loss', val_losses, epoch)

        # early stopping
        early_stopping(val_losses, model, model_file)

        # evaluate testing data
        if len(data_obj.test_loc) == 0 and False:
            model.eval()
            prediction = []
            with torch.no_grad():
                for i, data in enumerate(test_idx_data_loader):
                    batch_idx = data[0]
                    batch_x = construct_sequence_x(batch_idx, data_obj.dynamic_x, data_obj.static_x)  # (b, t, c, h, w)
                    out, _, _, _, _ = model(batch_x)
                    prediction.append(out.cpu().data.numpy())
            prediction = np.concatenate(prediction)
            acc = compute_error(data_obj.test_y[args.seq_len + 1:, ...], prediction)
            if args.verbose:
                logging.info('Epoch [{}/{}] testing: rmse = {:.3f}, mape = {:.3f}, r2 = {:.3f}.'
                             .format(epoch, args.num_epochs, *acc))

        if early_stopping.early_stop:
            break
class App:
    def __init__(self, model, early_stopping=True):
        self.model = model
        if early_stopping:
            self.early_stopping = EarlyStopping(patience=100, verbose=True)

    def train(self, data, config, save_path='', mode=NODE_CLASSIFICATION):
        loss_fcn = torch.nn.CrossEntropyLoss()

        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=config['lr'],
                                     weight_decay=config['weight_decay'])

        labels = data[LABELS]

        # initialize graph
        if mode == NODE_CLASSIFICATION:
            train_mask = data[TRAIN_MASK]
            val_mask = data[VAL_MASK]
            dur = []
            for epoch in range(config['n_epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                # forward
                logits = self.model(None)
                loss = loss_fcn(logits[train_mask], labels[train_mask])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss = self.model.eval_node_classification(labels, val_mask)
                print("Epoch {:05d} | Time(s) {:.4f} | Train loss {:.4f} | Val accuracy {:.4f} | "
                      "Val loss {:.4f}".format(epoch, np.mean(dur), loss.item(), val_acc, val_loss))

                self.early_stopping(val_loss, self.model, save_path)
                if self.early_stopping.early_stop:
                    print("Early stopping")
                    break

        elif mode == GRAPH_CLASSIFICATION:
            self.accuracies = np.zeros(10)
            graphs = data[GRAPH]  # load all the graphs

            for k in range(10):  # 10-fold cross validation
                start = int(len(graphs) / 10) * k
                end = int(len(graphs) / 10) * (k + 1)

                # testing batch
                testing_graphs = graphs[start:end]
                self.testing_labels = labels[start:end]
                self.testing_batch = dgl.batch(testing_graphs)

                # training batch
                training_graphs = graphs[:start] + graphs[end + 1:]
                training_labels = labels[list(range(0, start)) + list(range(end + 1, len(graphs)))]
                training_samples = list(map(list, zip(training_graphs, training_labels)))
                training_batches = DataLoader(training_samples,
                                              batch_size=config['batch_size'],
                                              shuffle=True,
                                              collate_fn=collate)

                dur = []
                for epoch in range(config['n_epochs']):
                    self.model.train()
                    if epoch >= 3:
                        t0 = time.time()
                    losses = []
                    training_accuracies = []
                    for iter, (bg, label) in enumerate(training_batches):
                        logits = self.model(bg)
                        loss = loss_fcn(logits, label)
                        losses.append(loss.item())
                        _, indices = torch.max(logits, dim=1)
                        correct = torch.sum(indices == label)
                        training_accuracies.append(correct.item() * 1.0 / len(label))

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    if epoch >= 3:
                        dur.append(time.time() - t0)

                    val_acc, val_loss = self.model.eval_graph_classification(self.testing_labels,
                                                                             self.testing_batch)
                    print("Epoch {:05d} | Time(s) {:.4f} | Train acc {:.4f} | Train loss {:.4f} "
                          "| Val accuracy {:.4f} | Val loss {:.4f}".format(
                              epoch, np.mean(dur) if dur else 0, np.mean(training_accuracies),
                              np.mean(losses), val_acc, val_loss))

                    is_better = self.early_stopping(val_loss, self.model, save_path)
                    if is_better:
                        self.accuracies[k] = val_acc
                    if self.early_stopping.early_stop:
                        print("Early stopping")
                        break

                self.early_stopping.reset()
        else:
            raise RuntimeError

    def test(self, data, load_path='', mode=NODE_CLASSIFICATION):
        try:
            print('*** Load pre-trained model ***')
            self.model = load_checkpoint(self.model, load_path)
        except ValueError as e:
            print('Error while loading the model.', e)

        if mode == NODE_CLASSIFICATION:
            test_mask = data[TEST_MASK]
            labels = data[LABELS]
            acc, _ = self.model.eval_node_classification(labels, test_mask)
        else:
            acc = np.mean(self.accuracies)

        print("\nTest Accuracy {:.4f}".format(acc))
        return acc
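The two snippets above call their EarlyStopping instance directly, early_stopping(val_loss, model, save_path), read an early_stop flag, and in the graph-classification case also use the returned "improved" flag and a reset() between folds. That checkpoint-saving variant is not reproduced here; the sketch below only illustrates the assumed interface (the class name, defaults, and checkpoint format are guesses, not the original class).

import torch

class CheckpointEarlyStoppingSketch:
    def __init__(self, patience=100, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def reset(self):
        # start a fresh patience window, e.g. between cross-validation folds
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model=None, save_path=''):
        improved = self.best_loss is None or val_loss < self.best_loss
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            if model is not None and save_path:
                torch.save(model.state_dict(), save_path)  # keep the best weights on disk
                if self.verbose:
                    print('Validation loss improved; checkpoint saved to', save_path)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return improved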
def train(self, data_train, data_valid, enable_es=1):
    with tf.Session(graph=self.graph) as session:
        tf.set_random_seed(1234)
        logger = Logger(session, self.summary_dir)
        # here you initialize the tensorflow saver that will be used in saving the checkpoints.
        # max_to_keep: defaults to keeping the 5 most recent checkpoints of your model
        saver = tf.train.Saver()
        early_stopping = EarlyStopping()

        if self.restore == 1 and self.load(session, saver):
            num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(session)
            print('EPOCHS trained: ', num_epochs_trained)
        else:
            print('Initializing Variables ...')
            tf.global_variables_initializer().run()

        if self.model_graph.cur_epoch_tensor.eval(session) == self.epochs:
            return

        for cur_epoch in range(self.model_graph.cur_epoch_tensor.eval(session), self.epochs + 1, 1):
            print('EPOCH: ', cur_epoch)
            self.current_epoch = cur_epoch

            # beta = utils.sigmoid(cur_epoch - 50)
            beta = 1.
            losses, recons, cond_prior, KL_w, y_prior, L2_loss = self.train_epoch(
                session, logger, data_train, beta=beta)
            train_string = 'TRAIN | Loss: ' + str(losses) + \
                           ' | Recons: ' + str(recons) + \
                           ' | CP: ' + str(cond_prior) + \
                           ' | KL_w: ' + str(KL_w) + \
                           ' | KL_y: ' + str(y_prior) + \
                           ' | L2_loss: ' + str(L2_loss)
            # train_string = colored(train_string, 'red', attrs=['reverse', 'blink'])
            train_string = colored(train_string, 'red')

            if np.isnan(losses):
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                print('Recons: ', recons)
                print('CP: ', cond_prior)
                print('KL_w: ', KL_w)
                print('KL_y: ', y_prior)
                sys.exit()

            loss_val, recons, cond_prior, KL_w, y_prior, L2_loss = self.valid_epoch(
                session, logger, data_valid, beta=beta)
            valid_string = 'VALID | Loss: ' + str(loss_val) + \
                           ' | Recons: ' + str(recons) + \
                           ' | CP: ' + str(cond_prior) + \
                           ' | KL_w: ' + str(KL_w) + \
                           ' | KL_y: ' + str(y_prior) + \
                           ' | L2_loss: ' + str(L2_loss)

            print(train_string)
            print(valid_string)

            if cur_epoch > 0 and cur_epoch % 10 == 0:
                self.save(session, saver, self.model_graph.global_step_tensor.eval(session))

            session.run(self.model_graph.increment_cur_epoch_tensor)

            # Early stopping
            if enable_es == 1 and early_stopping.stop(loss_val):
                print('Early Stopping!')
                break

        self.save(session, saver, self.model_graph.global_step_tensor.eval(session))
    return
def fit(self, X, y=None):
    print('\nProcessing data...')
    self.data_train = data_utils.process_data(X, y, test_size=0)
    if self.config.plot:
        self.data_plot = self.data_train

    self.config.num_batches = self.data_train.num_batches(self.config.batch_size)

    if not self.config.isBuilt:
        self.config.restore = True
        self.build_model(self.data_train.height, self.data_train.width, self.data_train.num_channels)
    else:
        assert (self.config.height == self.data_train.height) and \
               (self.config.width == self.data_train.width) and \
               (self.config.num_channels == self.data_train.num_channels), \
            'Wrong dimension of data. Expected shape {}, and got {}'.format(
                (self.config.height, self.config.width, self.config.num_channels),
                (self.data_train.height, self.data_train.width, self.data_train.num_channels))

    '''  -------------------------------------------------------------------------------
                                    TRAIN THE MODEL
    ------------------------------------------------------------------------------------- '''
    print('\nTraining a model...')
    with tf.Session(graph=self.graph) as session:
        tf.set_random_seed(self.config.seeds)
        self.session = session
        logger = Logger(self.session, self.config.log_dir)
        saver = tf.train.Saver()
        early_stopper = EarlyStopping(name='total loss', decay_fn=self.decay_fn)

        if self.config.restore and self.load(self.session, saver):
            load_config = file_utils.load_args(self.config.model_name, self.config.config_dir)
            self.config.update(load_config)
            num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(self.session)
            print('EPOCHS trained: ', num_epochs_trained)
        else:
            print('Initializing Variables ...')
            tf.global_variables_initializer().run()

        for cur_epoch in range(self.model_graph.cur_epoch_tensor.eval(self.session), self.config.epochs + 1, 1):
            print('EPOCH: ', cur_epoch)
            self.current_epoch = cur_epoch

            losses_tr = self._train(self.data_train, self.session, logger)
            if np.isnan(losses_tr[0]):
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                for lname, lval in zip(self.model_graph.losses, losses_tr):
                    print(lname, lval)
                sys.exit()

            train_msg = 'TRAIN: \n'
            for lname, lval in zip(self.model_graph.losses, losses_tr):
                train_msg += str(lname) + ': ' + str(lval) + ' | '
            print(train_msg)
            print()

            if (cur_epoch == 1) or ((cur_epoch % self.config.save_epoch == 0) and (cur_epoch != 0)):
                gc.collect()
                self.save(self.session, saver, self.model_graph.global_step_tensor.eval(self.session))
                if self.config.plot:
                    self.plot_latent(cur_epoch)

            self.session.run(self.model_graph.increment_cur_epoch_tensor)

            # Early stopping
            if self.config.early_stopping and early_stopper.stop(losses_tr[0]):
                print('Early Stopping!')
                break

            if cur_epoch % self.config.colab_save == 0:
                if self.config.colab:
                    self.push_colab()

        self.save(self.session, saver, self.model_graph.global_step_tensor.eval(self.session))
        if self.config.plot:
            self.plot_latent(cur_epoch)
        if self.config.colab:
            self.push_colab()
    return
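Both TensorFlow snippets above use yet another interface: early_stopping.stop(loss_val) / early_stopper.stop(losses_tr[0]), optionally constructed with a name and a decay_fn hook. The original class is not shown in this excerpt; the sketch below is only a plausible reading of that interface, and every name and default in it is an assumption.

class StopOnPlateauSketch:
    def __init__(self, name='loss', patience=5, min_delta=1e-4, decay_fn=None):
        self.name = name
        self.patience = patience
        self.min_delta = min_delta
        self.decay_fn = decay_fn  # optional hook, e.g. to decay the learning rate on a plateau
        self.best = None
        self.bad_steps = 0

    def stop(self, loss):
        # returns True once `loss` has not improved for `patience` consecutive calls
        if self.best is None or loss < self.best - self.min_delta:
            self.best = loss
            self.bad_steps = 0
            return False
        self.bad_steps += 1
        if self.decay_fn is not None:
            self.decay_fn()
        return self.bad_steps >= self.patience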
def __init__(self, model, early_stopping=True):
    self.model = model
    if early_stopping:
        self.early_stopping = EarlyStopping(patience=100, verbose=True)
def main():
    args = parse_args()

    config_path = args.config_file_path
    config = get_config(config_path, new_keys_allowed=True)

    config.defrost()
    config.experiment_dir = os.path.join(config.log_dir, config.experiment_name)
    config.tb_dir = os.path.join(config.experiment_dir, 'tb')
    config.model.best_checkpoint_path = os.path.join(config.experiment_dir, 'best_checkpoint.pt')
    config.model.last_checkpoint_path = os.path.join(config.experiment_dir, 'last_checkpoint.pt')
    config.config_save_path = os.path.join(config.experiment_dir, 'segmentation_config.yaml')
    config.freeze()

    init_experiment(config)
    set_random_seed(config.seed)

    train_dataset = make_dataset(config.train.dataset)
    train_loader = make_data_loader(config.train.loader, train_dataset)

    val_dataset = make_dataset(config.val.dataset)
    val_loader = make_data_loader(config.val.loader, val_dataset)

    device = torch.device(config.device)
    model = make_model(config.model).to(device)

    optimizer = make_optimizer(config.optim, model.parameters())
    scheduler = None

    loss_f = make_loss(config.loss)

    early_stopping = EarlyStopping(**config.stopper.params)

    train_writer = SummaryWriter(log_dir=os.path.join(config.tb_dir, 'train'))
    val_writer = SummaryWriter(log_dir=os.path.join(config.tb_dir, 'val'))

    for epoch in range(1, config.epochs + 1):
        print(f'Epoch {epoch}')
        train_metrics = train(model, optimizer, train_loader, loss_f, device)
        write_metrics(epoch, train_metrics, train_writer)
        print_metrics('Train', train_metrics)

        val_metrics = val(model, val_loader, loss_f, device)
        write_metrics(epoch, val_metrics, val_writer)
        print_metrics('Val', val_metrics)

        early_stopping(val_metrics['loss'])
        if config.model.save and early_stopping.counter == 0:
            torch.save(model.state_dict(), config.model.best_checkpoint_path)
            print('Saved best model checkpoint to disk.')
        if early_stopping.early_stop:
            print(f'Early stopping after {epoch} epochs.')
            break

        if scheduler:
            scheduler.step()

    train_writer.close()
    val_writer.close()

    if config.model.save:
        torch.save(model.state_dict(), config.model.last_checkpoint_path)
        print('Saved last model checkpoint to disk.')
def train(pairs_batch_train, pairs_batch_dev, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion, ctc_loss, batch_size, num_epochs, device, train_data_len, dev_data_len):
    clip = 1.0
    tf_rate = 1
    lambda_factor = 0.8
    early_stopping = EarlyStopping(patience=10, verbose=False, delta=0)

    for epoch in range(1, 10):
        encoder.train()
        decoder.train()
        batch_loss_train = 0
        batch_loss_dev = 0

        for iteration, batch in enumerate(pairs_batch_train):
            pad_input_seqs, input_seq_lengths, pad_target_seqs, target_seq_lengths = batch
            pad_input_seqs, pad_target_seqs = pad_input_seqs.to(device), pad_target_seqs.to(device)

            train_loss = 0
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # print(torch.isnan(pad_input_seqs).any())
            encoder_output, encoder_hidden, encoder_output_prob = encoder(pad_input_seqs, input_seq_lengths)

            decoder_input = torch.ones(batch_size, 1).long().to(device)
            decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                              encoder_hidden[1].sum(0, keepdim=True))

            teacher_forcing = True if random.random() <= tf_rate else False
            attn_weights = F.softmax(torch.ones(encoder_output.size(1), 1, encoder_output.size(0)), dim=-1).to(device)

            if teacher_forcing:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden, attn_weights = decoder(
                        decoder_input, decoder_hidden, encoder_output, attn_weights)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = pad_target_seqs[i].detach()
            else:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                    _, topi = decoder_output.topk(1)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = topi.detach()

            # CTC LOSS
            targets = pad_target_seqs.squeeze().permute(1, 0)
            input_lengths = torch.ones(encoder_output_prob.size(1)) * encoder_output_prob.size(0)
            input_lengths = input_lengths.type(torch.LongTensor)
            target_seq_lengths = np.array(target_seq_lengths)
            target_lengths = torch.from_numpy(target_seq_lengths)
            train_loss_ctc = ctc_loss(encoder_output_prob, targets, input_lengths, target_lengths)

            loss = (0.8 * train_loss) + (0.2 * train_loss_ctc)
            # loss = train_loss
            batch_loss_train += loss.data

            # backward step
            loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
            encoder_optimizer.step()
            decoder_optimizer.step()

        # CALCULATE EVALUATION
        with torch.no_grad():
            encoder.eval()
            decoder.eval()
            for _, batch in enumerate(pairs_batch_dev):
                pad_input_seqs, input_seq_lengths, pad_target_seqs, target_seq_lengths = batch
                pad_input_seqs, pad_target_seqs = pad_input_seqs.to(device), pad_target_seqs.to(device)

                dev_loss = 0
                encoder_output, encoder_hidden, encoder_output_prob = encoder(pad_input_seqs, input_seq_lengths)

                decoder_input = torch.ones(batch_size, 1).long().to(device)
                decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                                  encoder_hidden[1].sum(0, keepdim=True))

                teacher_forcing = True if random.random() <= tf_rate else False
                attn_weights = F.softmax(torch.ones(encoder_output.size(1), 1, encoder_output.size(0)), dim=-1).to(device)

                if teacher_forcing:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden, attn_weights = decoder(
                            decoder_input, decoder_hidden, encoder_output, attn_weights)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = pad_target_seqs[i].detach()
                else:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
                        _, topi = decoder_output.topk(1)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = topi.detach()

                # CTC LOSS
                targets = pad_target_seqs.squeeze().permute(1, 0)
                input_lengths = torch.ones(encoder_output_prob.size(1)) * encoder_output_prob.size(0)
                input_lengths = input_lengths.type(torch.LongTensor)
                target_seq_lengths = np.array(target_seq_lengths)
                target_lengths = torch.from_numpy(target_seq_lengths)
                dev_loss_ctc = ctc_loss(encoder_output_prob, targets, input_lengths, target_lengths)

                loss_dev = (0.8 * dev_loss) + (0.2 * dev_loss_ctc)
                # loss_dev = dev_loss
                batch_loss_dev += loss_dev.data

        print('[Epoch: %d] train_loss: %.4f val_loss: %.4f' %
              (epoch + 1,
               (batch_loss_train.item() / (train_data_len / batch_size)),
               (batch_loss_dev.item() / (dev_data_len / batch_size))))

        with open('loss/english_asr_finetuned.txt', 'a') as f:
            f.write(str(epoch + 1) + ' ' +
                    str(batch_loss_train.item() / (train_data_len / batch_size)) + ' ' +
                    str(batch_loss_dev.item() / (dev_data_len / batch_size)) + '\n')

        print('saving the models...')
        torch.save({
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'encoder_optimizer': encoder_optimizer.state_dict(),
            'decoder_optimizer': decoder_optimizer.state_dict(),
        }, 'weights/english_asr_finetuned/state_dict_' + str(epoch + 1) + '.pt')