feature_size = 32 lr = 0.01 dataset = generateDataset(n, m, num_instances, feature_size) A, b, G, h = createConstraintMatrix(m, n, budget) net = FacilityNN(input_shape=(n, feature_size), output_shape=(n, m)) optimizer = torch.optim.SGD(net.parameters(), lr=lr) # surrogate setup if training_method == 'surrogate': # A, b, G, h = LPCreateSurrogateConstraintMatrix(m, n) variable_size = n T_size = 8 # init_T = normalize_matrix(torch.rand(variable_size, T_size)) init_T = normalize_matrix_positive(torch.rand(variable_size, T_size)) T = torch.tensor(init_T, requires_grad=True) T_lr = lr T_optimizer = torch.optim.Adam([T], lr=T_lr) optimize_result = getOptimalDecision(n, m, torch.Tensor(sample_instance.c), sample_instance.d, sample_instance.f, budget=budget) optimal_x = torch.Tensor(optimize_result.x) xx = torch.autograd.Variable(optimal_x, requires_grad=True) d, f = sample_instance.d, sample_instance.f c = torch.Tensor(
def train_model(train_data, validate_data, test_data, lr=0.1, learning_model='random_walk_distribution', block_selection='coverage', n_epochs=150, batch_size=100, optimizer='adam', omega=4, training_method='surrogate-decision-focused', max_norm=0.1, block_cut_size=0.5, T_size=10):
    """Train a GCN edge-score predictor jointly with a surrogate matrix T.

    Runs an epoch loop over three modes ("training", "validating", "testing"),
    computing both a likelihood-style loss and a defender utility per instance
    via ``getDefUtility``.  Epoch -1 is a special pass that evaluates the
    "optimal" baseline using the identity surrogate (``full_T``/``full_s``)
    and ground-truth ``phi_true`` / ``unbiased_probs_true``.

    Parameters
    ----------
    train_data / validate_data / test_data : sequences of per-instance tuples
        (G, Fv, coverage_prob, phi_true, path_list, cut, log_prob,
        unbiased_probs_true, previous_gradient) — inferred from the unpacking
        below; confirm against the dataset generator.
    lr : learning rate shared by the net optimizer and the T optimizer.
    optimizer : str selector ('adam'/'sgd'/'adamax'); NOTE the name is then
        rebound to the constructed optimizer object.
    training_method : only the '*decision-focused' methods are implemented;
        anything else raises TypeError during training/validation.
    max_norm : element-wise gradient clamp bound (not a true norm clip).

    Returns
    -------
    (net2, training/validating/testing loss lists,
     training/validating/testing defender-utility lists,
     (forward_time, qp_time, backward_time), last epoch index)

    NOTE(review): relies on module-level names (``feature_size``,
    ``GCNPredictionNet2``, ``phi2prob``, ``prob2unbiased``, ``getDefUtility``,
    ``normalize_matrix_positive``) not visible in this chunk.
    """
    net2 = GCNPredictionNet2(feature_size)
    net2.train()

    sample_graph = train_data[0][0]
    # T reparameterizes edge coverage into a T_size-dimensional space; s is a
    # fixed zero bias.
    init_T, init_s = torch.rand(sample_graph.number_of_edges(), T_size), torch.zeros(
        sample_graph.number_of_edges())
    T, s = torch.tensor(
        normalize_matrix_positive(init_T), requires_grad=True
    ), torch.tensor(
        init_s, requires_grad=False
    )  # bias term s can cause infeasibility. It is not yet known how to resolve it.
    # Identity surrogate used only for the epoch == -1 "optimal" evaluation.
    full_T, full_s = torch.eye(sample_graph.number_of_edges(), requires_grad=False), torch.zeros(
        sample_graph.number_of_edges(), requires_grad=False)
    T_lr = lr

    # ================ Optimizer ================
    # The string parameter `optimizer` is rebound here to the optimizer object.
    if optimizer == 'adam':
        optimizer = optim.Adam(net2.parameters(), lr=lr)
        T_optimizer = optim.Adam([T, s], lr=T_lr)
        # optimizer=optim.Adam(list(net2.parameters()) + [T], lr=lr)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(net2.parameters(), lr=lr)
        T_optimizer = optim.SGD([T, s], lr=T_lr)
    elif optimizer == 'adamax':
        optimizer = optim.Adamax(net2.parameters(), lr=lr)
        T_optimizer = optim.Adamax([T, s], lr=T_lr)

    # scheduler = ReduceLROnPlateau(optimizer, 'min')
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    T_scheduler = ReduceLROnPlateau(T_optimizer, 'min')

    training_loss_list, validating_loss_list, testing_loss_list = [], [], []
    training_defender_utility_list, validating_defender_utility_list, testing_defender_utility_list = [], [], []

    print("Training...")
    forward_time, qp_time, backward_time = 0, 0, 0

    pretrain_epochs = 0
    decay_rate = 0.95
    # Epoch -1 is the baseline/optimal evaluation pass; real training starts
    # once epoch > 0 (see the batched backward condition below).
    for epoch in range(-1, n_epochs):
        epoch_forward_time, epoch_qp_time, epoch_backward_time = 0, 0, 0
        # ts/df weights are computed but currently unused in the backward pass
        # (the weighted combination is commented out below).
        if epoch <= pretrain_epochs:
            ts_weight = 1
            df_weight = 0
        else:
            ts_weight = decay_rate**(epoch - pretrain_epochs)
            df_weight = 1 - ts_weight

        for mode in ["training", "validating", "testing"]:
            if mode == "training":
                dataset = train_data
                epoch_loss_list = training_loss_list
                epoch_def_list = training_defender_utility_list
                if epoch > 0:
                    net2.train()
                else:
                    net2.eval()
            elif mode == "validating":
                dataset = validate_data
                epoch_loss_list = validating_loss_list
                epoch_def_list = validating_defender_utility_list
                net2.eval()
            elif mode == "testing":
                dataset = test_data
                epoch_loss_list = testing_loss_list
                epoch_def_list = testing_defender_utility_list
                net2.eval()
            else:
                raise TypeError("Not valid mode: {}".format(mode))

            loss_list, def_obj_list = [], []
            for iter_n in tqdm.trange(len(dataset)):
                G, Fv, coverage_prob, phi_true, path_list, cut, log_prob, unbiased_probs_true, previous_gradient = dataset[
                    iter_n]
                n, m = G.number_of_nodes(), G.number_of_edges()
                budget = G.graph['budget']

                # ==================== Visualization ===================
                # if iter_n == 0 and mode == 'training':
                #     from plot_utils import plot_graph, reduce_dimension
                #     T_reduced = T.detach().numpy()
                #     reduce_dimension(T.detach().numpy())
                #     plot_graph(G, T_reduced, epoch)

                # =============== Compute edge probabilities ===========
                Fv_torch = torch.as_tensor(Fv, dtype=torch.float)
                edge_index = torch.Tensor(list(
                    nx.DiGraph(G).edges())).long().t()
                phi_pred = net2(Fv_torch, edge_index).view(
                    -1
                ) if epoch >= 0 else phi_true  # when epoch < 0, testing the optimal loss and defender utility
                # phi_pred.require_grad = True

                unbiased_probs_pred = phi2prob(
                    G, phi_pred) if epoch >= 0 else unbiased_probs_true
                biased_probs_pred = prob2unbiased(
                    G, -coverage_prob, unbiased_probs_pred,
                    omega=omega)  # feeding negative coverage to be biased

                # =================== Compute loss =====================
                # Mean negative log-likelihood of the observed paths under the
                # predicted biased transition probabilities, reported relative
                # to the ground-truth log_prob.
                log_prob_pred = torch.zeros(1)
                for path in path_list:
                    for e in path:
                        log_prob_pred -= torch.log(
                            biased_probs_pred[e[0]][e[1]])
                log_prob_pred /= len(path_list)
                loss = (log_prob_pred - log_prob)[0]

                # ============== COMPUTE DEFENDER UTILITY ==============
                single_data = dataset[iter_n]

                if epoch == -1:  # optimal solution
                    cut_size = m
                    def_obj, def_coverage, (
                        single_forward_time, single_qp_time
                    ) = getDefUtility(
                        single_data, full_T, full_s, unbiased_probs_pred,
                        learning_model, cut_size=cut_size, omega=omega,
                        verbose=False, training_mode=False,
                        training_method=training_method,
                        block_selection=block_selection)  # feed forward only
                    single_forward_time, single_qp_time = 0, 0  # testing epoch so not counting the computation time
                elif mode == 'testing' or mode == "validating" or epoch <= 0:
                    cut_size = m
                    def_obj, def_coverage, (
                        single_forward_time, single_qp_time
                    ) = getDefUtility(
                        single_data, T, s, unbiased_probs_pred,
                        learning_model, cut_size=cut_size, omega=omega,
                        verbose=False, training_mode=False,
                        training_method=training_method,
                        block_selection=block_selection)  # feed forward only
                    single_forward_time, single_qp_time = 0, 0  # testing epoch so not counting the computation time
                else:
                    if training_method == 'decision-focused' or training_method == 'surrogate-decision-focused':
                        cut_size = m
                    else:
                        raise TypeError('Not defined method')
                    def_obj, def_coverage, (
                        single_forward_time, single_qp_time) = getDefUtility(
                            single_data, T, s, unbiased_probs_pred,
                            learning_model, cut_size=cut_size, omega=omega,
                            verbose=False, training_mode=True,
                            training_method=training_method,
                            block_selection=block_selection
                        )  # most time-consuming part

                epoch_forward_time += single_forward_time
                epoch_qp_time += single_qp_time
                def_obj_list.append(def_obj.item())
                loss_list.append(loss.item())

                # Batched backward pass: only every `batch_size`-th instance,
                # only while actually training (epoch > 0, training mode).
                if (iter_n % batch_size == (batch_size - 1)) and (
                        epoch > 0) and (mode == "training"):
                    backward_start_time = time.time()
                    optimizer.zero_grad()
                    T_optimizer.zero_grad()
                    # NOTE(review): bare except silently skips the step on any
                    # backward/clamp failure (e.g. a parameter with grad None).
                    try:
                        if training_method == "decision-focused" or training_method == "surrogate-decision-focused":
                            (-def_obj).backward()
                            # (-def_obj * df_weight + loss * ts_weight).backward()
                        else:
                            raise TypeError("Not Implemented Method")
                        # torch.nn.utils.clip_grad_norm_(net2.parameters(), max_norm=max_norm) # gradient clipping
                        # Element-wise clamp of each gradient (not a norm clip).
                        for parameter in net2.parameters():
                            parameter.grad = torch.clamp(parameter.grad, min=-max_norm, max=max_norm)
                        T.grad = torch.clamp(T.grad, min=-max_norm, max=max_norm)
                        optimizer.step()
                        T_optimizer.step()
                    except:
                        print("no grad is backpropagated...")
                    epoch_backward_time += time.time() - backward_start_time

                # ============== normalize T matrix =================
                # Re-project T onto the feasible (positive, normalized) set
                # after every instance.
                T.data = normalize_matrix_positive(T.data)
                # T.data = normalize_matrix_qr(T.data)
                # s.data = normalize_vector(s.data, max_value=budget)
                # print(s.data)

            # ========= scheduler using validation set ==========
            if (epoch > 0) and (mode == "validating"):
                if training_method == "decision-focused" or training_method == "surrogate-decision-focused":
                    # Maximizing utility, so minimize its negation.
                    scheduler.step(-np.mean(def_obj_list))
                    T_scheduler.step(-np.mean(def_obj_list))
                else:
                    raise TypeError("Not Implemented Method")

            # ======= Storing loss and defender utility =========
            epoch_loss_list.append(np.mean(loss_list))
            epoch_def_list.append(np.mean(def_obj_list))

            # ========== Print stuff after every epoch ==========
            np.random.shuffle(dataset)
            print("Mode: {}/ Epoch number: {}/ Loss: {}/ DefU: {}".format(
                mode, epoch, np.mean(loss_list), np.mean(def_obj_list)))

        print('Forward time for this epoch: {}'.format(epoch_forward_time))
        print('QP time for this epoch: {}'.format(epoch_qp_time))
        print('Backward time for this epoch: {}'.format(epoch_backward_time))
        if epoch >= 0:
            forward_time += epoch_forward_time
            qp_time += epoch_qp_time
            backward_time += epoch_backward_time

        # ============= early stopping criteria =============
        # Stop when validation defender utility has failed to improve (within
        # 1e-4) for kk consecutive epochs vs. the preceding kk epochs.
        # ([1:] drops the epoch == -1 baseline entry.)
        kk = 3
        if epoch >= kk * 2 - 1:
            GE_counts = np.sum(
                np.array(validating_defender_utility_list[1:][-kk:]) <=
                np.array(validating_defender_utility_list[1:][-2 * kk:-kk]) + 1e-4)
            print(
                'Generalization error increases counts: {}'.format(GE_counts))
            if GE_counts == kk:
                break

    # Computed for reporting; currently unused beyond this point.
    average_nodes = np.mean([x[0].number_of_nodes() for x in train_data] +
                            [x[0].number_of_nodes() for x in validate_data] +
                            [x[0].number_of_nodes() for x in test_data])
    average_edges = np.mean([x[0].number_of_edges() for x in train_data] +
                            [x[0].number_of_edges() for x in validate_data] +
                            [x[0].number_of_edges() for x in test_data])

    print('Total forward time: {}'.format(forward_time))
    print('Total qp time: {}'.format(qp_time))
    print('Total backward time: {}'.format(backward_time))

    return net2, training_loss_list, validating_loss_list, testing_loss_list, training_defender_utility_list, validating_defender_utility_list, testing_defender_utility_list, (
        forward_time, qp_time, backward_time), epoch
def surrogate_train_portfolio(model, covariance_model, T_init, optimizer, T_optimizer, epoch, dataset, training_method='surrogate', device='cpu', evaluate=False):
    """Run one training epoch of the surrogate portfolio-optimization model.

    For each batch: predict returns, build the surrogate QP over the reduced
    variable y (x = T @ y), solve it with qpth or a CvxpyLayer (selected by
    the module-level ``solver``), evaluate the realized objective against the
    ground-truth covariance, and back-propagate the negated objective through
    the QP into both networks and ``T_init``.

    Parameters
    ----------
    model : return-prediction network (trained here).
    covariance_model : parameterized covariance estimator (trained here).
    T_init : (n, T_size) surrogate matrix with requires_grad=True; updated by
        ``T_optimizer`` and re-projected with ``normalize_matrix_positive``.
    epoch : passed in but not read in this body.
    training_method : only 'surrogate' is implemented.

    Returns
    -------
    (average_loss, average_obj,
     (forward_time, inference_time, qp_time, backward_time))
        NOTE(review): qp_time is never incremented here; QP solve time is
        folded into inference_time.

    NOTE(review): depends on module globals ``solver``, ``REG``, ``alpha``,
    ``MAX_NORM``, ``T_MAX_NORM``, ``sqrtm``, ``computeCovariance`` — not
    visible in this chunk.
    """
    model.train()
    covariance_model.train()
    loss_fn = torch.nn.MSELoss()
    train_losses, train_objs = [], []
    forward_time, inference_time, qp_time, backward_time = 0, 0, 0, 0
    T_size = T_init.shape[1]
    with tqdm.tqdm(dataset) as tqdm_loader:
        for batch_idx, (features, covariance_mat, labels) in enumerate(tqdm_loader):
            forward_start_time = time.time()
            features, covariance_mat, labels = features[0].to(device), covariance_mat[0].to(device), labels[0,:,0].to(device).float() # only one single data
            n = len(covariance_mat)
            # Ground-truth covariance, ridge-regularized for positive definiteness.
            Q_real = computeCovariance(covariance_mat) * (1 - REG) + torch.eye(n) * REG
            predictions = model(features.float())[:,0]
            loss = loss_fn(predictions, labels)

            # randomly select column to update
            # T = init_T
            # Detach-and-clone then restore one random column, so gradients
            # flow to T_init through that single column only.
            T = T_init.detach().clone()
            random_column = torch.randint(T_init.shape[1], [1])
            T[:,random_column] = T_init[:,random_column]
            # Predicted covariance, same ridge regularization as Q_real.
            Q = covariance_model() * (1 - REG) + torch.eye(n) * REG
            forward_time += time.time() - forward_start_time

            inference_start_time = time.time()
            # Reduced-space linear term and Cholesky-like factor of the
            # reduced quadratic term.
            p = predictions @ T
            L = sqrtm(T.t() @ Q @ T) # torch.cholesky(T.t() @ Q @ T)
            # =============== solving QP using qpth ================
            if solver == 'qpth':
                # Constraints expressed on x = T @ y: x >= 0, sum(x) == 1.
                G = -torch.eye(n) @ T
                h = torch.zeros(n)
                A = torch.ones(1,n) @ T
                b = torch.ones(1)
                qp_solver = qpth.qp.QPFunction()
                y = qp_solver(alpha * T.t() @ Q @ T, -p, G, h, A, b)[0]
                x = T @ y
            # =============== solving QP using CVXPY ===============
            elif solver == 'cvxpy':
                y_var = cp.Variable(T_size)
                L_para = cp.Parameter((T_size,T_size))
                p_para = cp.Parameter(T_size)
                T_para = cp.Parameter((n,T_size))
                constraints = [T_para @ y_var >= 0, cp.sum(T_para @ y_var) == 1]
                objective = cp.Minimize(0.5 * alpha * cp.sum_squares(L_para @ y_var) + p_para.T @ y_var)
                problem = cp.Problem(objective, constraints)
                cvxpylayer = CvxpyLayer(problem, parameters=[L_para, p_para, T_para], variables=[y_var])
                y, = cvxpylayer(L, -p, T)
                x = T @ y
            # print("predicted objective value:", predictions.t() @ x - 0.5 * alpha * x.t() @ Q @ x)
            # Realized (risk-adjusted) objective under the true covariance.
            obj = labels @ x - 0.5 * alpha * x.t() @ Q_real @ x
            # print("real objective value:", obj)
            inference_time += time.time() - inference_start_time

            # ====================== back-prop =====================
            optimizer.zero_grad()
            T_optimizer.zero_grad()
            backward_start_time = time.time()
            # NOTE(review): bare except — on any backward/clamp failure the
            # step below still runs with whatever gradients exist.
            try:
                if training_method == 'surrogate':
                    # Off-diagonal covariance of T's columns; a decorrelation
                    # penalty, currently weighted 0.0 (logged only).
                    covariance = computeCovariance(T.t())
                    T_weight = 0.0
                    TS_weight = 0.0
                    T_loss = torch.sum(covariance) - torch.sum(torch.diag(covariance))
                    (-obj + T_weight * T_loss).backward()
                    # Element-wise gradient clamping (not a norm clip).
                    for parameter in model.parameters():
                        parameter.grad = torch.clamp(parameter.grad, min=-MAX_NORM, max=MAX_NORM)
                    for parameter in covariance_model.parameters():
                        parameter.grad = torch.clamp(parameter.grad, min=-MAX_NORM, max=MAX_NORM)
                    T_init.grad = torch.clamp(T_init.grad, min=-T_MAX_NORM, max=T_MAX_NORM)
                else:
                    raise ValueError('Not implemented method')
            except:
                print("no grad is backpropagated...")
                pass
            optimizer.step()
            T_optimizer.step()
            # Re-project T_init onto the positive, normalized set.
            T_init.data = normalize_matrix_positive(T_init.data)
            backward_time += time.time() - backward_start_time

            train_losses.append(loss.item())
            train_objs.append(obj.item())
            tqdm_loader.set_postfix(loss=f'{loss.item():.6f}', obj=f'{obj.item()*100:.6f}%', T_loss=f'{T_loss:.3f}')

    average_loss = np.mean(train_losses)
    average_obj = np.mean(train_objs)
    return average_loss, average_obj, (forward_time, inference_time, qp_time, backward_time)
def surrogate_train_submodular(net, init_T, optimizer, T_optimizer, epoch, sample_instance, dataset, lr=0.1, training_method='two-stage', device='cpu'):
    """Run one training epoch on the surrogate submodular (facility) problem.

    For each batch: compute the two-stage MSE loss, and — for the 'surrogate'
    method — solve the surrogate decision problem per sample, differentiate
    through a qpth QP built around the optimum, and evaluate the true
    objective of the recovered decision x = T @ y.  The backward section then
    dispatches on ``training_method`` ('two-stage' / 'decision-focused' /
    'surrogate').

    Parameters
    ----------
    net : prediction network mapping features to outputs (trained here).
    init_T : (x_size, variable_size) surrogate matrix with requires_grad=True;
        updated by ``T_optimizer`` under the 'surrogate' method.
    epoch : epoch index; epoch < 0 substitutes labels for predictions
        (baseline/optimal evaluation pass).
    sample_instance : provides n, m, d, f, budget for the constraint matrices.
    lr : accepted but not read in this body.

    Returns
    -------
    (average_loss, average_obj,
     (forward_time, inference_time, qp_time, backward_time))

    NOTE(review): depends on module globals/helpers (``MAX_NORM``,
    ``createSurrogateConstraintMatrix``, ``getSurrogateOptimalDecision``,
    ``getSurrogateHessian``, ``getSurrogateDerivative``, ``getObjective``,
    ``computeCovariance``, ``normalize_matrix_positive``, ``qpth``) not
    visible in this chunk.
    """
    net.train()
    # loss_fn = torch.nn.BCELoss()
    loss_fn = torch.nn.MSELoss()
    train_losses, train_objs, train_T_losses = [], [], []
    x_size, variable_size = init_T.shape
    n, m, d, f, budget = sample_instance.n, sample_instance.m, torch.Tensor(
        sample_instance.d), torch.Tensor(
            sample_instance.f), sample_instance.budget
    # Constraints in the original x-space; mapped to y-space via T below.
    A, b, G, h = createSurrogateConstraintMatrix(m, n, budget)
    forward_time, inference_time, qp_time, backward_time = 0, 0, 0, 0
    with tqdm.tqdm(dataset) as tqdm_loader:
        for batch_idx, (features, labels) in enumerate(tqdm_loader):
            forward_start_time = time.time()
            features, labels = features.to(device), labels.to(device)
            if epoch >= 0:
                outputs = net(features)
            else:
                # Baseline pass: use ground truth in place of predictions.
                outputs = labels
            # two-stage loss
            loss = loss_fn(outputs, labels)
            forward_time += time.time() - forward_start_time

            # decision-focused loss
            objective_value_list, T_loss_list = [], []
            batch_size = len(labels)

            # randomly select column to update
            # NOTE(review): unlike the portfolio version, T aliases init_T
            # directly here (the column-masking variant is commented out), so
            # gradients reach every column of init_T.
            T = init_T
            # T = init_T.detach().clone()
            # random_column = torch.randint(init_T.shape[1], [1])
            # T[:,random_column] = init_T[:,random_column]

            # if batch_idx == 0:
            #     plot_graph(labels.detach().numpy(), T.detach().numpy(), epoch)

            for (label, output) in zip(labels, outputs):
                if training_method == 'surrogate':
                    # output = label # for debug only # TODO
                    inference_start_time = time.time()
                    optimize_result = getSurrogateOptimalDecision(
                        T, n, m, output, d, f,
                        budget=budget)  # end-to-end for both T and net
                    inference_time += time.time() - inference_start_time
                    optimal_y = torch.Tensor(optimize_result.x)
                    qp_start_time = time.time()
                    if optimize_result.success:
                        optimal_y = torch.Tensor(optimize_result.x)
                        # No equality constraints; inequalities are the
                        # x-space constraints mapped through T plus y >= 0.
                        newA, newb = torch.Tensor(), torch.Tensor()
                        newG = torch.cat(
                            (A @ T, G @ T, -torch.eye(variable_size)))
                        newh = torch.cat((b, h, torch.zeros(variable_size)))
                        # newG = torch.cat((A @ T, G @ T, -torch.eye(variable_size), torch.eye(variable_size)))
                        # newh = torch.cat((b, h, torch.zeros(variable_size), torch.ones(variable_size)))
                        # Local quadratic model around the optimum; the
                        # +10*I shift forces positive definiteness for qpth.
                        Q = getSurrogateHessian(
                            T, optimal_y, n, m, output, d,
                            f).detach() + torch.eye(len(optimal_y)) * 10
                        L = torch.cholesky(Q)
                        jac = -getSurrogateDerivative(T, optimal_y, n, m, output, d, f, create_graph=True)
                        p = jac - Q @ optimal_y
                        qp_solver = qpth.qp.QPFunction()
                        # TODO unknown bug
                        # NOTE(review): bare except — falls back to the
                        # scipy optimum with no gradient path through T.
                        try:
                            y = qp_solver(Q, p, newG, newh, newA, newb)[0]
                            x = T @ y
                        except:
                            y = optimal_y
                            x = T.detach() @ optimal_y
                            print('qp error! no gradient!')
                        # if True:
                        #     # =============== solving QP using CVXPY ===============
                        #     y_default = cp.Variable(variable_size)
                        #     G_default, h_default = cp.Parameter(newG.shape), cp.Parameter(newh.shape)
                        #     L_default = cp.Parameter((variable_size, variable_size))
                        #     p_default = cp.Parameter(variable_size)
                        #     constraints = [G_default @ y_default <= h_default]
                        #     objective = cp.Minimize(0.5 * cp.sum_squares(L_default @ y_default) + p_default.T @ y_default)
                        #     problem = cp.Problem(objective, constraints)
                        #     cvxpylayer = CvxpyLayer(problem, parameters=[G_default, h_default, L_default, p_default], variables=[y_default])
                        #     coverage_qp_solution, = cvxpylayer(newG, newh, L, p)
                        #     y = coverage_qp_solution
                        #     x = T @ y

                        # time test...
                        # time_test_start = time.time()
                        # for i in range(20):
                        #     _ = getDerivative(x, n, m, output, d, f)
                        # print('original gradient time:', time.time() - time_test_start)
                        # time_test_start = time.time()
                        # for i in range(20):
                        #     _ = getSurrogateDerivative(T, y, n, m, output, d, f, create_graph=False)
                        # print('surrogate gradient time:', time.time() - time_test_start)
                        # except:
                        #     print("CVXPY solver fails... Usually because Q is not PSD")
                        #     y = optimal_y
                        #     x = T.detach() @ optimal_y
                    else:  # torch.norm(y.detach() - optimal_y) > 0.05: # TODO
                        print('Optimization failed...')
                        y = optimal_y
                        x = T.detach() @ optimal_y
                    qp_time += time.time() - qp_start_time
                else:
                    raise ValueError('Not implemented method!')
                obj = getObjective(x, n, m, label, d, f)
                tmp_T_loss = 0  # torch.sum((projected_real_optimal_x - real_optimal_x) ** 2).item()
                objective_value_list.append(obj)
                T_loss_list.append(tmp_T_loss)

            # print(pairwise_distances(T.t().detach().numpy()))
            objective = sum(objective_value_list) / batch_size
            T_loss = torch.Tensor([0])
            # print('objective', objective)

            optimizer.zero_grad()
            backward_start_time = time.time()
            # NOTE(review): bare except — any backward/step failure is
            # swallowed and the batch is effectively skipped.
            try:
                if training_method == 'two-stage':
                    loss.backward()
                    optimizer.step()
                elif training_method == 'decision-focused':
                    (-objective).backward()
                    # Element-wise gradient clamping (not a norm clip).
                    for parameter in net.parameters():
                        parameter.grad = torch.clamp(parameter.grad, min=-MAX_NORM, max=MAX_NORM)
                    optimizer.step()
                elif training_method == 'surrogate':
                    # Off-diagonal column covariance of T, logged as T_loss.
                    covariance = computeCovariance(T.t())
                    T_loss = torch.sum(covariance) - torch.sum(
                        torch.diag(covariance))
                    T_optimizer.zero_grad()
                    (-objective).backward()
                    # T_loss.backward() # TODO: minimizing reparameterization loss
                    for parameter in net.parameters():
                        parameter.grad = torch.clamp(parameter.grad, min=-MAX_NORM, max=MAX_NORM)
                    init_T.grad = torch.clamp(init_T.grad, min=-MAX_NORM, max=MAX_NORM)
                    optimizer.step()
                    T_optimizer.step()
                    # Re-project T onto the positive, normalized set.
                    init_T.data = normalize_matrix_positive(init_T.data)
                else:
                    raise ValueError('Not implemented method')
            except:
                print("Error! No grad is backpropagated...")
                pass
            backward_time += time.time() - backward_start_time

            train_losses.append(loss.item())
            train_objs.append(objective.item())
            train_T_losses.append(T_loss.item())
            average_loss = np.mean(train_losses)
            average_obj = np.mean(train_objs)
            average_T_loss = np.mean(train_T_losses)
            # Print status
            # tqdm_loader.set_postfix(loss=f'{average_loss:.3f}', obj=f'{average_obj:.3f}')
            tqdm_loader.set_postfix(loss=f'{average_loss:.3f}', obj=f'{average_obj:.3f}', T_loss=f'{average_T_loss:.3f}')

    average_loss = np.mean(train_losses)
    average_obj = np.mean(train_objs)
    return average_loss, average_obj, (forward_time, inference_time, qp_time, backward_time)
train_dataset, validate_dataset, test_dataset = generateDataset( sp500_data, n=n, num_samples=num_samples) feature_size = train_dataset.dataset[0][0].shape[1] model = PortfolioModel(input_size=feature_size, output_size=1) covariance_model = CovarianceModel(n=n) optimizer = torch.optim.Adam(list(model.parameters()) + list(covariance_model.parameters()), lr=lr) scheduler = ReduceLROnPlateau(optimizer, 'min') if training_method == 'surrogate': T_size = args.T_size init_T = normalize_matrix_positive(torch.rand(n, T_size)) T = torch.tensor(init_T, requires_grad=True) T_lr = lr T_optimizer = torch.optim.Adam([T], lr=T_lr) T_scheduler = ReduceLROnPlateau(T_optimizer, 'min') train_loss_list, train_obj_list = [], [] test_loss_list, test_obj_list = [], [] validate_loss_list, validate_obj_list = [], [] print('n: {}, lr: {}'.format(n, lr)) print('Start training...') evaluate = False if training_method == 'two-stage' else True total_forward_time, total_inference_time, total_qp_time, total_backward_time = 0, 0, 0, 0 forward_time_list, inference_time_list, qp_time_list, backward_time_list = [], [], [], [] for epoch in range(-1, num_epochs):