def run_exp1_on_conditions(name, with_attention=False):
    run_path = join(path_configs['results_dir'], name, str(time.time()))
    writer = SummaryWriter(run_path)
    cifar_data = get_cifar_data(data_dir=path_configs['data_dir'],
                                batch_size=exp1_hp['batch_size'])
    model = LogisticRegression(input_dim=cifar_data['x_size'][1],
                               output_dim=len(cifar_data['classes']),
                               with_attention=with_attention)
    model = move(model)
    loss_fn = exp1_hp['base_loss_fn']()
    optimizer = exp1_hp['optimizer'](model.parameters(),
                                     lr=exp1_hp['learning_rate'])
    for epoch in range(exp1_hp['num_epochs']):
        # train loop
        model.train()
        for batch_num, (x, y) in enumerate(cifar_data['train_loader']):
            x = move(x)
            y = move(y)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar(
                'Train Loss', loss,
                epoch * len(cifar_data['train_loader']) + batch_num)
        # test loop
        model.eval()
        losses = []
        accuracy = []
        with torch.no_grad():  # no gradients needed at eval time
            for batch_num, (x, y) in enumerate(cifar_data['test_loader']):
                x = move(x)
                y = move(y)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)
                losses.append(loss)
                accuracy += (torch.argmax(y_hat, dim=1) == y).int().tolist()
        writer.add_scalar('Test Loss', sum(losses) / len(losses), epoch)
        writer.add_scalar('Test Accuracy', sum(accuracy) / len(accuracy), epoch)
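
# A hypothetical driver for experiment 1 (a sketch, not part of the original
# experiment configs): the condition names below are illustrative, and
# `path_configs`/`exp1_hp` are assumed to be defined at module level.
def run_exp1():
    for name, with_attention in [('logistic_regression', False),
                                 ('logistic_regression_attention', True)]:
        run_exp1_on_conditions(name, with_attention=with_attention)
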
def run_exp3_on_conditions(name, hidden_dim, depth, attn_bool_vector):
    run_path = join(path_configs['results_dir'], name, str(time.time()))
    writer = SummaryWriter(run_path)
    cifar_data = get_cifar_data(data_dir=path_configs['data_dir'],
                                batch_size=exp3_hp['batch_size'])
    model = ConstantWidthDeepNet(input_dim=cifar_data['x_size'][1],
                                 hidden_dim=hidden_dim,
                                 depth=depth,
                                 output_dim=len(cifar_data['classes']),
                                 with_attention=attn_bool_vector)

    # snapshot the initial value weights so per-layer diligence (relative
    # distance from initialization) can be computed at test time
    initial_weights = [
        move(torch.clone(model.fetch_value_weights(i)))
        for i in range(model.depth)
    ]
    initial_norms = [torch.linalg.norm(w) for w in initial_weights]
    model = move(model)

    loss_fn = exp3_hp['base_loss_fn']()
    optimizer = exp3_hp['optimizer'](model.parameters(),
                                     lr=exp3_hp['learning_rate'])
    prev_intralayer_outputs = []
    # running per-entry average of each layer's value-weight gradients
    cumul_grad_per_neuron = [
        move(torch.zeros(layer.V.weight.size()))
        for layer in model._modules['layers']
    ]
    train_time_records = []
    test_time_records = []
    sporadic_records = pd.DataFrame(
        columns=['batch_idx', 'architecture', 'run', 'layer', 'activations'])
    sporadic_records.to_csv(join(run_path, 'sporadic.csv'), index=False)

    # sporadic (expensive) metrics are logged on an exponentially spaced
    # schedule of batch indices
    next_activation_batch_idx = 1
    activation_batch_rate = 1.2

    for epoch in range(exp3_hp['num_epochs']):
        gc.collect()
        model.train()
        test_time_record = {'epoch': epoch,
                            'architecture': name,
                            'run': 0}  # TODO: set the actual run index

        for batch_num, (x, y) in enumerate(cifar_data['train_loader']):
            x = move(x)
            y = move(y)
            y_hat, intralayer_outputs_by_layer = model(x)
            loss = loss_fn(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step_idx = epoch * len(cifar_data['train_loader']) + batch_num
            train_time_record = {'batch_idx': step_idx,
                                 'architecture': name,
                                 'run': 0}  # TODO: set the actual run index

            if prev_intralayer_outputs:
                for layer_idx, intralayer_outputs in enumerate(
                        intralayer_outputs_by_layer):
                    # (disabled) per-layer logging of the norms and means of
                    # q, k, v, q_hadamard_k, the attention weights, and the
                    # layer outputs used to live here

                    # correlation of this batch's activations with the
                    # previous batch's activations in the same layer
                    # X := current activations, B x D
                    # Y := previous activations, B x D
                    X = intralayer_outputs['output']
                    Y = prev_intralayer_outputs[layer_idx]['output']
                    X_bar = torch.mean(X)
                    Y_bar = torch.mean(Y)
                    X_res = X - X_bar
                    Y_res = Y - Y_bar
                    X_mse = torch.square(X_res)
                    Y_mse = torch.square(Y_res)
                    X_std = torch.sqrt(1 / torch.numel(X) * torch.sum(X_mse))
                    Y_std = torch.sqrt(1 / torch.numel(Y) * torch.sum(Y_mse))
                    cov = torch.mean(X_res * Y_res)
                    corr = cov / (X_std * Y_std)
                    train_time_record['layer_{}_correlation'.format(
                        layer_idx)] = corr.item()
                    writer.add_scalar(
                        "Layer {} activation correlation".format(layer_idx),
                        corr, step_idx)

                    if step_idx >= int(next_activation_batch_idx):
                        writer.add_histogram(
                            "Layer {} activation distribution".format(layer_idx),
                            X, step_idx)
                        gini_coeff = batched_gini(X)
                        train_time_record['layer_{}_gini'.format(
                            layer_idx)] = gini_coeff
                        writer.add_scalar("Layer {} Gini".format(layer_idx),
                                          gini_coeff, step_idx)

                        quantile_dict = {}
                        cum_quantile_dict = {}
                        V_norm_dict = {}
                        for i, layer in enumerate(model._modules['layers']):
                            # instantaneous gradient quantiles
                            quantiles = np.quantile(
                                layer.V.weight.grad.flatten().tolist(),
                                exp3_hp['quantiles'])
                            quantile_dict[i] = quantiles
                            writer.add_scalar(
                                "Layer {} 20th percentile gradient".format(i),
                                quantiles[2], step_idx)
                            writer.add_scalar(
                                "Layer {} 50th percentile gradient".format(i),
                                quantiles[5], step_idx)
                            writer.add_scalar(
                                "Layer {} 80th percentile gradient".format(i),
                                quantiles[8], step_idx)

                            # quantiles of the running gradient average
                            cumul_grad_per_neuron[i] = (
                                layer.V.weight.grad +
                                cumul_grad_per_neuron[i] * step_idx) / (step_idx + 1)
                            cum_quantiles = np.quantile(
                                cumul_grad_per_neuron[i].flatten().tolist(),
                                exp3_hp['quantiles'])
                            cum_quantile_dict[i] = cum_quantiles
                            writer.add_scalar(
                                "Layer {} 20th percentile gradient average".format(i),
                                cum_quantiles[2], step_idx)
                            writer.add_scalar(
                                "Layer {} 50th percentile gradient average".format(i),
                                cum_quantiles[5], step_idx)
                            writer.add_scalar(
                                "Layer {} 80th percentile gradient average".format(i),
                                cum_quantiles[8], step_idx)

                            V_norm = torch.linalg.norm(layer.V.weight, ord='nuc')
                            writer.add_scalar(
                                "Layer {} weight norm".format(i),
                                V_norm, step_idx)
                            V_norm_dict[i] = V_norm

                        # append this record to the on-disk sporadic CSV
                        # (DataFrame.append was removed in pandas 2.0, so
                        # build a one-row frame and concat instead)
                        sporadic_df = pd.read_csv(join(run_path, 'sporadic.csv'))
                        new_row = pd.DataFrame([{
                            'batch_idx': step_idx,
                            'architecture': name,
                            'run': 0,
                            'layer': layer_idx,
                            'activations': X.cpu().detach().numpy(),
                            'gradient_quantiles': quantile_dict,
                            'v_norm': V_norm_dict
                        }])
                        sporadic_df = pd.concat([sporadic_df, new_row],
                                                ignore_index=True)
                        sporadic_df.to_csv(join(run_path, 'sporadic.csv'),
                                           index=False)
                        del sporadic_df
                        gc.collect()

                    del Y, Y_std, Y_res, Y_mse, Y_bar, X_std, X_res, X_mse, X_bar, corr, cov

                if step_idx >= int(next_activation_batch_idx):
                    # advance the exponential logging schedule
                    next_activation_batch_idx = max(
                        step_idx,
                        activation_batch_rate * next_activation_batch_idx)

            prev_intralayer_outputs = intralayer_outputs_by_layer
            writer.add_scalar('Train Loss', loss, step_idx)
            train_time_record['train_loss'] = loss.item()
            train_time_records.append(train_time_record)
            del x, y, y_hat, loss

        # test loop
        model.eval()
        losses = []
        accuracy = []
        with torch.no_grad():  # no gradients needed at eval time
            for batch_num, (x, y) in enumerate(cifar_data['test_loader']):
                x = move(x)
                y = move(y)
                y_hat, _ = model(x)
                loss = loss_fn(y_hat, y)
                losses.append(loss)
                accuracy += (torch.argmax(y_hat, dim=1) == y).int().tolist()
                del x, y, y_hat
        test_loss = sum(losses) / len(losses)
        test_accuracy = sum(accuracy) / len(accuracy)
        writer.add_scalar('Test Loss', test_loss, epoch)
        writer.add_scalar('Test Accuracy', test_accuracy, epoch)
        test_time_record['test_loss'] = test_loss.item()
        test_time_record['test_accuracy'] = test_accuracy

        # diligence: relative distance of each layer's value weights from init
        for i in range(model.depth):
            dist = torch.linalg.norm(
                model.fetch_value_weights(i) -
                initial_weights[i]) / initial_norms[i]
            writer.add_scalar('Diligence of Layer ' + str(i), dist, epoch)
            test_time_record['layer_{}_diligence'.format(i)] = dist.item()
        test_time_records.append(test_time_record)

    train_time_df = pd.DataFrame(train_time_records)
    test_time_df = pd.DataFrame(test_time_records)
    train_time_df.to_csv(join(run_path, 'train_time.csv'))
    test_time_df.to_csv(join(run_path, '3000test2.csv'))
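
# `batched_gini` is defined elsewhere in this repo; the sketch below is one
# plausible stand-in (an assumption, not the repo's implementation). It pools
# the absolute activations across the whole batch and applies the standard
# sorted-value formula G = sum_i (2i - n - 1) * x_(i) / (n * sum_i x_(i)).
def gini_sketch(acts):
    x, _ = torch.sort(torch.abs(acts.detach().flatten()))
    n = x.numel()
    idx = torch.arange(1, n + 1, device=x.device, dtype=x.dtype)
    return float(torch.sum((2 * idx - n - 1) * x) / (n * torch.sum(x) + 1e-12))
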
def run_exp31_on_conditions(run, hidden_dim, depth, attention_layers_as_bool,
                            results_dir, train_time_list, test_time_list):
    # set up tensorboard
    if not any(attention_layers_as_bool):
        architecture = 'Fully Feedforward'
    else:
        d = {0: '0th', 1: '1st', 2: '2nd', 3: '3rd'}
        architecture = 'Attention in the {layer_ord} Layer'.format(
            layer_ord=d[attention_layers_as_bool.index(True)])
    unique_run_name = architecture + ',' + str(run)
    tensorboard_run_path = join(results_dir, unique_run_name, str(time.time()))
    writer = SummaryWriter(tensorboard_run_path)

    # set up data, model, diligence calculations
    # (relies on a module-level `cifar_data`)
    model = ConstantWidthDeepNet(input_dim=cifar_data['x_size'][1],
                                 hidden_dim=hidden_dim,
                                 depth=depth,
                                 output_dim=len(cifar_data['classes']),
                                 with_attention=attention_layers_as_bool)
    initial_weights = [
        move(torch.clone(model.fetch_value_weights(i)))
        for i in range(model.depth)
    ]
    initial_norms = [torch.linalg.norm(w) for w in initial_weights]
    model = move(model)

    # set up training
    loss_fn = exp31_hp['base_loss_fn']()
    optimizer = exp31_hp['optimizer'](model.parameters(),
                                      lr=exp31_hp['learning_rate'])
    train_time_records = []
    test_time_records = []

    for epoch in range(exp31_hp['num_epochs']):
        gc.collect()
        test_time_record = {'epoch': epoch,
                            'architecture': architecture,
                            'run': run}

        # train loop
        model.train()
        for batch_num, (x, y) in enumerate(cifar_data['train_loader']):
            x = move(x)
            y = move(y)
            y_hat, intralayer_outputs_by_layer = model(x)
            loss = loss_fn(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step_idx = epoch * len(cifar_data['train_loader']) + batch_num
            train_time_record = {'batch_idx': step_idx,
                                 'architecture': architecture,
                                 'run': run}
            writer.add_scalar('Train Loss', loss, step_idx)
            train_time_record['train_loss'] = loss.item()
            train_time_records.append(train_time_record)
            del x, y, y_hat, loss

        # test loop
        model.eval()
        losses = []
        accuracy = []
        with torch.no_grad():  # no gradients needed at eval time
            for batch_num, (x, y) in enumerate(cifar_data['test_loader']):
                x = move(x)
                y = move(y)
                y_hat, _ = model(x)
                loss = loss_fn(y_hat, y)
                losses.append(loss)
                accuracy += (torch.argmax(y_hat, dim=1) == y).int().tolist()
                del x, y, y_hat

        # test-time metrics (part 1: performance)
        test_loss = sum(losses) / len(losses)
        test_accuracy = sum(accuracy) / len(accuracy)
        writer.add_scalar('Test Loss', test_loss, epoch)
        writer.add_scalar('Test Accuracy', test_accuracy, epoch)
        test_time_record['test_loss'] = test_loss.item()
        test_time_record['test_accuracy'] = test_accuracy
        del losses, accuracy, test_loss, test_accuracy

        # test-time metrics (part 2: diligence)
        for i in range(model.depth):
            dist = torch.linalg.norm(
                model.fetch_value_weights(i) -
                initial_weights[i]) / initial_norms[i]
            writer.add_scalar('Diligence of Layer ' + str(i), dist, epoch)
            test_time_record['layer_{}_diligence'.format(i)] = dist.item()
            del dist
        test_time_records.append(test_time_record)

    train_time_df = pd.DataFrame(train_time_records)
    test_time_df = pd.DataFrame(test_time_records)
    train_time_list.append(train_time_df)
    test_time_list.append(test_time_df)
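
# A hypothetical launcher for experiment 3.1 (a sketch; the `num_runs`,
# `hidden_dim`, and `depth` defaults are illustrative). It sweeps the
# fully-feedforward baseline plus each one-hot attention placement and
# concatenates the per-run dataframes afterwards.
def run_exp31(results_dir, num_runs=3, hidden_dim=64, depth=4):
    train_time_list, test_time_list = [], []
    placements = [[False] * depth] + [[i == j for i in range(depth)]
                                      for j in range(depth)]
    for run in range(num_runs):
        for attention_layers_as_bool in placements:
            run_exp31_on_conditions(run, hidden_dim, depth,
                                    attention_layers_as_bool, results_dir,
                                    train_time_list, test_time_list)
    return pd.concat(train_time_list), pd.concat(test_time_list)
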
def run_exp32_on_conditions(run, hidden_dim, depth, attention_layers_as_bool,
                            results_dir, train_time_list, test_time_list,
                            sporadic_list, model_params):
    # set up tensorboard
    if not any(attention_layers_as_bool):
        architecture = 'Fully Feedforward'
    else:
        d = {0: '0th', 1: '1st', 2: '2nd', 3: '3rd'}
        architecture = 'Attention in the {layer_ord} Layer'.format(
            layer_ord=d[attention_layers_as_bool.index(True)])
    unique_run_name = architecture + ',' + str(run)
    tensorboard_run_path = join(results_dir, unique_run_name, str(time.time()))
    writer = SummaryWriter(tensorboard_run_path)

    # set up data, model, diligence calculations
    # (relies on a module-level `cifar_data`)
    model = ConstantWidthDeepNet(input_dim=cifar_data['x_size'][1],
                                 hidden_dim=hidden_dim,
                                 depth=depth,
                                 output_dim=len(cifar_data['classes']),
                                 with_attention=attention_layers_as_bool)
    initial_weights = [
        move(torch.clone(model.fetch_value_weights(i)))
        for i in range(model.depth)
    ]
    initial_norms = [torch.linalg.norm(w) for w in initial_weights]
    model = move(model)

    # set up training
    loss_fn = exp32_hp['base_loss_fn']()
    optimizer = exp32_hp['optimizer'](model.parameters(),
                                      lr=exp32_hp['learning_rate'])
    train_time_records = []
    test_time_records = []
    prev_intralayer_outputs = []
    # sporadic (expensive) metrics are logged on an exponentially spaced
    # schedule of batch indices
    next_activation_batch_idx = 1
    activation_batch_rate = 1.2
    cumul_grad_per_neuron = [
        move(torch.zeros(layer.V.weight.size()))
        for layer in model._modules['layers']
    ]
    sporadic_records = []

    # sample a fixed subset of neurons per layer to track over training;
    # the final layer has only 10 (class-logit) neurons, so track them all
    neuron_layer_to_idxs = {
        layer_idx: random.sample(range(hidden_dim),
                                 exp32_hp['num_neurons_to_track'])
        for layer_idx in range(4)
    }
    neuron_layer_to_idxs[3] = list(range(10))

    for epoch in range(exp32_hp['num_epochs']):
        gc.collect()
        test_time_record = {'epoch': epoch,
                            'architecture': architecture,
                            'run': run}

        # train loop
        model.train()
        for batch_num, (x, y) in enumerate(cifar_data['train_loader']):
            # ordinary train-loop calculations
            x = move(x)
            y = move(y)
            y_hat, intralayer_outputs_by_layer = model(x)
            loss = loss_fn(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step_idx = epoch * len(cifar_data['train_loader']) + batch_num
            train_time_record = {'batch_idx': step_idx,
                                 'architecture': architecture,
                                 'run': run}
            writer.add_scalar('Train Loss', loss, step_idx)
            train_time_record['train_loss'] = loss.item()

            # consistent (every-batch) train-time metrics:
            # 1. intralayer norms
            # 2. activation correlations
            # 3. a subset of neuron-wise activations and gradients, saved as a
            #    time series (autocorrelation is computed post facto)
            # 4. Hoyer sparsity measure: https://arxiv.org/pdf/0811.4706v2.pdf,
            #    https://math.stackexchange.com/questions/117860/how-to-define-sparseness-of-a-vector
            if prev_intralayer_outputs:
                for layer_idx, intralayer_outputs in enumerate(
                        intralayer_outputs_by_layer):
                    # 1. intralayer norms
                    if 'q_norm' in intralayer_outputs:  # TODO: replace this membership check with an explicit attention flag
                        writer.add_scalar(
                            "Layer {} q_hadamard_k norm".format(layer_idx),
                            intralayer_outputs['q_hadamard_k_norm'], step_idx)
                        train_time_record['Layer {} q_hadamard_k norm'.format(
                            layer_idx
                        )] = intralayer_outputs['q_hadamard_k_norm'].item()

                    # 2. activation correlations with the previous batch
                    # X := current activations, B x D
                    # Y := previous activations, B x D
                    X = intralayer_outputs['output']
                    Y = prev_intralayer_outputs[layer_idx]['output']
                    N = torch.numel(X)
                    X_bar = torch.mean(X)
                    Y_bar = torch.mean(Y)
                    X_res = X - X_bar
                    Y_res = Y - Y_bar
                    X_mse = torch.square(X_res)
                    Y_mse = torch.square(Y_res)
                    X_std = torch.sqrt(1 / N * torch.sum(X_mse))
                    Y_std = torch.sqrt(1 / N * torch.sum(Y_mse))
                    cov = torch.mean(X_res * Y_res)
                    corr = cov / (X_std * Y_std)
                    train_time_record['layer_{}_correlation'.format(
                        layer_idx)] = corr.item()
                    writer.add_scalar(
                        "Layer {} activation correlation".format(layer_idx),
                        corr, step_idx)

                    # 3. save the tracked neurons' activations and gradient norms
                    # takes B x hidden_dim -> B x sample_size
                    activation_slice = intralayer_outputs['v_vector'][
                        :, neuron_layer_to_idxs[layer_idx]].flatten().tolist()
                    train_time_record['layer_{}_neuron_level_activations'.format(
                        layer_idx)] = activation_slice
                    # layer_(i+1)_width x layer_(i)_width
                    gradient_xsection = model.fetch_value_weights(layer_idx).grad
                    # sample_size x layer_(i)_width
                    gradient_xsection = gradient_xsection[
                        neuron_layer_to_idxs[layer_idx], :]
                    gradient_norm_slice = torch.sum(
                        torch.square(gradient_xsection),
                        dim=1).squeeze().flatten().tolist()
                    train_time_record['layer_{}_neuron_level_gradient_norms'.format(
                        layer_idx)] = gradient_norm_slice

                    # 4. Hoyer sparsity measure (entrywise vector norms, so
                    # flatten first; for a 2-D tensor torch.linalg.norm with
                    # ord=1/ord=2 would otherwise return matrix operator norms)
                    l1_X = torch.linalg.norm(X.flatten(), ord=1)
                    l2_X = torch.linalg.norm(X.flatten(), ord=2)
                    hoyer_sparsity = float(
                        (math.sqrt(N) - l1_X / l2_X) / (math.sqrt(N) - 1))
                    train_time_record['layer_{}_hoyer_sparsity'.format(
                        layer_idx)] = hoyer_sparsity
                    writer.add_scalar(
                        "Layer {} Hoyer sparsity".format(layer_idx),
                        hoyer_sparsity, step_idx)

                    del X, Y, Y_std, Y_res, Y_mse, Y_bar, X_std, X_res, X_mse, X_bar, corr, cov, N
                    del gradient_xsection
                    del l1_X, l2_X
            prev_intralayer_outputs = intralayer_outputs_by_layer

            # sporadic metrics:
            # 1. activation distribution
            # 2. Gini measure on activations
            # 3. gradient distribution
            # 4. norm and stable rank of each layer's weight matrices
            if step_idx >= int(next_activation_batch_idx):
                gc.collect()
                sporadic_record = {'batch_idx': step_idx,
                                   'architecture': architecture,
                                   'run': run}
                for layer_idx, intralayer_outputs in enumerate(
                        intralayer_outputs_by_layer):
                    # 1. activation distribution (too big to add to the record)
                    writer.add_histogram(
                        "Layer {} activation distribution".format(layer_idx),
                        intralayer_outputs['output'], step_idx)
                    # 2. Gini measure on activations
                    gini_coeff = batched_gini(intralayer_outputs['output'])
                    sporadic_record['layer_{}_gini'.format(
                        layer_idx)] = gini_coeff
                    writer.add_scalar("Layer {} Gini".format(layer_idx),
                                      gini_coeff, step_idx)

                # 3. gradient distribution, and
                # 4. norm and stable rank of the layer weight matrices
                quantile_dict = {}
                cum_quantile_dict = {}
                V_nuc_norm_dict = {}
                V_stable_rank_dict = {}
                for i, layer in enumerate(model._modules['layers']):
                    quantiles = np.quantile(
                        layer.V.weight.grad.flatten().tolist(),
                        exp32_hp['quantiles'])
                    quantile_dict[i] = quantiles
                    writer.add_scalar(
                        "Layer {} 20th percentile gradient".format(i),
                        quantiles[2], step_idx)
                    writer.add_scalar(
                        "Layer {} 50th percentile gradient".format(i),
                        quantiles[5], step_idx)
                    writer.add_scalar(
                        "Layer {} 80th percentile gradient".format(i),
                        quantiles[8], step_idx)

                    cumul_grad_per_neuron[i] = (
                        layer.V.weight.grad +
                        cumul_grad_per_neuron[i] * step_idx) / (step_idx + 1)
                    cum_quantiles = np.quantile(
                        cumul_grad_per_neuron[i].flatten().tolist(),
                        exp32_hp['quantiles'])
                    cum_quantile_dict[i] = cum_quantiles
                    writer.add_scalar(
                        "Layer {} 20th percentile gradient average".format(i),
                        cum_quantiles[2], step_idx)
                    writer.add_scalar(
                        "Layer {} 50th percentile gradient average".format(i),
                        cum_quantiles[5], step_idx)
                    writer.add_scalar(
                        "Layer {} 80th percentile gradient average".format(i),
                        cum_quantiles[8], step_idx)

                    V_nuc_norm = float(
                        torch.linalg.matrix_norm(layer.V.weight, ord='nuc'))
                    writer.add_scalar(
                        "Layer {} weight nuclear norm".format(i),
                        V_nuc_norm, step_idx)
                    V_nuc_norm_dict[i] = V_nuc_norm
                    # Frobenius-to-spectral norm ratio (the square root of
                    # the usual stable-rank definition)
                    V_stable_rank = float(
                        torch.linalg.matrix_norm(layer.V.weight, ord='fro') /
                        torch.linalg.matrix_norm(layer.V.weight, ord=2))
                    writer.add_scalar(
                        "Layer {} weight stable rank".format(i),
                        V_stable_rank, step_idx)
                    V_stable_rank_dict[i] = V_stable_rank

                    if isinstance(layer, AttendedLayer):  # TODO: fix
                        Q_nuc_norm = float(
                            torch.linalg.matrix_norm(layer.attn.Q.weight,
                                                     ord='nuc'))
                        writer.add_scalar(
                            "Layer {} Q nuclear norm".format(i),
                            Q_nuc_norm, step_idx)
                        Q_stable_rank = float(
                            torch.linalg.matrix_norm(layer.attn.Q.weight,
                                                     ord='fro') /
                            torch.linalg.matrix_norm(layer.attn.Q.weight,
                                                     ord=2))
                        writer.add_scalar(
                            "Layer {} Q stable rank".format(i),
                            Q_stable_rank, step_idx)
                        K_nuc_norm = float(
                            torch.linalg.matrix_norm(layer.attn.K.weight,
                                                     ord='nuc'))
                        writer.add_scalar(
                            "Layer {} K nuclear norm".format(i),
                            K_nuc_norm, step_idx)
                        K_stable_rank = float(
                            torch.linalg.matrix_norm(layer.attn.K.weight,
                                                     ord='fro') /
                            torch.linalg.matrix_norm(layer.attn.K.weight,
                                                     ord=2))
                        writer.add_scalar(
                            "Layer {} K stable rank".format(i),
                            K_stable_rank, step_idx)
                        del K_nuc_norm, K_stable_rank, Q_nuc_norm, Q_stable_rank
                    del V_nuc_norm, V_stable_rank

                sporadic_record['gradient_quantiles'] = quantile_dict
                sporadic_record['cumulative_gradient_quantiles'] = cum_quantile_dict
                sporadic_record['weight_nuclear_norms'] = V_nuc_norm_dict
                sporadic_record['weight_stable_rank'] = V_stable_rank_dict
                sporadic_records.append(sporadic_record)
                # advance the exponential logging schedule
                next_activation_batch_idx = max(
                    step_idx, activation_batch_rate * next_activation_batch_idx)

            del x, y, y_hat, loss
            train_time_records.append(train_time_record)

        # test loop
        model.eval()
        losses = []
        accuracy = []
        with torch.no_grad():  # no gradients needed at eval time
            for batch_num, (x, y) in enumerate(cifar_data['test_loader']):
                x = move(x)
                y = move(y)
                y_hat, _ = model(x)
                loss = loss_fn(y_hat, y)
                losses.append(loss)
                accuracy += (torch.argmax(y_hat, dim=1) == y).int().tolist()
                del x, y, y_hat

        # test-time metrics (part 1: performance)
        test_loss = sum(losses) / len(losses)
        test_accuracy = sum(accuracy) / len(accuracy)
        writer.add_scalar('Test Loss', test_loss, epoch)
        writer.add_scalar('Test Accuracy', test_accuracy, epoch)
        test_time_record['test_loss'] = test_loss.item()
        test_time_record['test_accuracy'] = test_accuracy
        del losses, accuracy, test_loss, test_accuracy

        # test-time metrics (part 2: diligence)
        for i in range(model.depth):
            dist = torch.linalg.norm(
                model.fetch_value_weights(i) -
                initial_weights[i]) / initial_norms[i]
            writer.add_scalar('Diligence of Layer ' + str(i), dist, epoch)
            test_time_record['layer_{}_diligence'.format(i)] = dist.item()
            del dist
        test_time_records.append(test_time_record)

    train_time_df = pd.DataFrame(train_time_records)
    test_time_df = pd.DataFrame(test_time_records)
    sporadic_df = pd.DataFrame(sporadic_records)
    train_time_list.append(train_time_df)
    test_time_list.append(test_time_df)
    sporadic_list.append(sporadic_df)
    model_params[(hidden_dim, architecture)] = model.cpu().state_dict()
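
# Sanity check for the Hoyer sparsity measure used above (a sketch; the test
# vectors are illustrative). Hoyer sparsity,
# (sqrt(n) - ||x||_1 / ||x||_2) / (sqrt(n) - 1), is 0 for a uniform vector
# and 1 for a one-hot vector.
def hoyer_sparsity_check():
    for x, expected in [(torch.ones(16), 0.0), (torch.eye(16)[0], 1.0)]:
        n = torch.numel(x)
        l1 = torch.linalg.norm(x.flatten(), ord=1)
        l2 = torch.linalg.norm(x.flatten(), ord=2)
        hoyer = float((math.sqrt(n) - l1 / l2) / (math.sqrt(n) - 1))
        assert abs(hoyer - expected) < 1e-6
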
def run_exp2_on_conditions(name, hidden_dim, with_attention1=False,
                           with_attention2=False):
    run_path = join(path_configs['results_dir'], name, str(time.time()))
    writer = SummaryWriter(run_path)
    cifar_data = get_cifar_data(data_dir=path_configs['data_dir'],
                                batch_size=exp2_hp['batch_size'])
    model = MLP(input_dim=cifar_data['x_size'][1],
                hidden_dim=hidden_dim,
                output_dim=len(cifar_data['classes']),
                with_attention1=with_attention1,
                with_attention2=with_attention2)
    # snapshot initial weights for the laziness metric
    initial_fc1 = move(torch.clone(model.fc1.weight))
    initial_fc2 = move(torch.clone(model.fc2.weight))
    initial_fc1_norm = torch.linalg.norm(initial_fc1)
    initial_fc2_norm = torch.linalg.norm(initial_fc2)
    model = move(model)
    loss_fn = exp2_hp['base_loss_fn']()
    optimizer = exp2_hp['optimizer'](model.parameters(),
                                     lr=exp2_hp['learning_rate'])
    for epoch in range(exp2_hp['num_epochs']):
        # train loop
        model.train()
        for batch_num, (x, y) in enumerate(cifar_data['train_loader']):
            x = move(x)
            y = move(y)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar(
                'Train Loss', loss,
                epoch * len(cifar_data['train_loader']) + batch_num)
        # test loop
        model.eval()
        losses = []
        accuracy = []
        with torch.no_grad():  # no gradients needed at eval time
            for batch_num, (x, y) in enumerate(cifar_data['test_loader']):
                x = move(x)
                y = move(y)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)
                losses.append(loss)
                accuracy += (torch.argmax(y_hat, dim=1) == y).int().tolist()
        writer.add_scalar('Test Loss', sum(losses) / len(losses), epoch)
        writer.add_scalar('Test Accuracy', sum(accuracy) / len(accuracy), epoch)
        # laziness: relative distance of each layer's weights from init
        dist_1 = torch.linalg.norm(
            model.fc1.weight - initial_fc1) / initial_fc1_norm
        dist_2 = torch.linalg.norm(
            model.fc2.weight - initial_fc2) / initial_fc2_norm
        writer.add_scalar('Laziness of First Layer', dist_1, epoch)
        writer.add_scalar('Laziness of Second Layer', dist_2, epoch)
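
# A hypothetical driver for experiment 2 (a sketch; the condition names are
# illustrative). It sweeps all four attention placements in the two-layer MLP.
def run_exp2(hidden_dim=64):
    for with_attention1 in (False, True):
        for with_attention2 in (False, True):
            name = 'mlp_attn1={}_attn2={}'.format(with_attention1,
                                                  with_attention2)
            run_exp2_on_conditions(name, hidden_dim,
                                   with_attention1=with_attention1,
                                   with_attention2=with_attention2)
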
def conditional_move(x, cond):
    # move to the accelerator only when `cond` holds (e.g. the profiler
    # reports that there is space); otherwise keep the tensor on the CPU
    if cond:
        return move(x)
    return x.cpu()
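
# Example use of conditional_move (a sketch; the 0.9 headroom threshold is
# illustrative): keep tensors on the GPU only while allocated memory stays
# below 90% of the device total, otherwise fall back to the CPU.
#
#     has_space = (torch.cuda.memory_allocated()
#                  < 0.9 * torch.cuda.get_device_properties(0).total_memory)
#     x = conditional_move(x, has_space)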