# ---------------------------------------------------------------------------
# SST experiment setup (script fragment): fills the shared params object `p`
# from CLI `args`, seeds, selects the GPU, and loads the biased SST CSV splits
# with torchtext. `args`, `seed`, `data`, `TabularDataset`, and `torch` are
# defined/imported earlier in the file (outside this view).
# ---------------------------------------------------------------------------
dataset_path = args.dataset_path
from params_fit import p  # shared experiment-parameter object
from params_save import S  # container class used to save results
p.which_adversarial = args.which_adversarial
p.out_dir = '../../models/SST/'
p.num_iters = 100
p.signal_strength = args.signal_strength
p.bias = "bias"
p.seed = args.seed
max_patience = 5  # early-stopping budget (epochs without improvement)
patience = 0
decoy_strength = args.decoy_strength
# NOTE(review): `seed` is given the whole params object, not `p.seed` —
# presumably the project helper reads p.seed internally; confirm.
seed(p)
s = S(p)
# output filename encodes the adversarial setting plus the param hash
out_name = str(args.which_adversarial) + p._str(p)
torch.cuda.set_device(args.gpu)

# torchtext fields: tokenized lowercased text, single categorical label
inputs = data.Field(lower=True)
answers = data.Field(sequential=False, unk_token=None)
tv_datafields = [("text", inputs), ("label", answers)]
train, dev, test = TabularDataset.splits(
    path=dataset_path,  # the root directory where the data lies
    train='train_bias_SST.csv',
    validation="dev_bias_SST.csv",
    test="test_bias_SST.csv",
    format='csv',
    # NOTE(review): skip_header=False means the CSVs are assumed headerless;
    # verify the generated *_bias_SST.csv files really have no header row.
    skip_header=False,
    fields=tv_datafields)
    metavar='N',
    help='how many batches to wait before logging training status')
parser.add_argument(
    '--regularizer_rate',
    type=float,
    default=0.0,
    metavar='N',
    help='how heavy to regularize lower order interaction (AKA color)')
parser.add_argument('--grad_method', type=int, default=0, metavar='N',
                    help='which gradient method is used - Grad or CD')
args = parser.parse_args()

# results container, keyed by the run's hyperparameters
s = S(args.epochs)
use_cuda = not args.no_cuda and torch.cuda.is_available()
regularizer_rate = args.regularizer_rate
s.regularizer_rate = regularizer_rate
num_blobs = 8  # fixed number of blobs in the synthetic dataset
s.num_blobs = num_blobs
s.seed = args.seed
device = torch.device("cuda" if use_cuda else "cpu")
# DataLoader kwargs (only when running on GPU)
kwargs = {
    'num_workers': 0,
    'pin_memory': True,
    # NOTE(review): np.random.seed(12) is CALLED here, so this key stores its
    # return value (None) — the actual effect is seeding numpy's *global* RNG
    # once at dict-construction time. With num_workers=0 a worker-init fn is
    # never invoked anyway; if per-worker seeding was intended it should be
    # `lambda worker_id: np.random.seed(12)`. Confirm intent before changing.
    'worker_init_fn': np.random.seed(12)
} if use_cuda else {}
def fit(p):
    """Fit one regression model described by the params object `p` and save results.

    Loads/generates train+test data (gaussian synthetic or a PMLB regression
    dataset), fits the estimator named by ``p.model_type`` ('linear_sta',
    'mdl_orig', 'mdl_m1', 'ols', 'lasso', 'ridge', or 'rf'), records the
    fitted weights, degrees of freedom, and train/test MSE on an ``S_save``
    results object, then writes everything out via ``save``.

    Side effects: reseeds the global RNG (several times), may mutate `p`
    (``p.n_train``, ``p.num_features``), and calls ``exit`` for infeasible
    training-set sizes.
    """
    out_name = p._str(p)  # generate random fname str before saving
    seed(p.seed)
    s = S_save(p)

    # ------------------------------ DATA ------------------------------
    # testing data should always be generated with the same seed
    if p.dset == 'gaussian':
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        # warning - this reseeds!
        X_train, y_train, X_test, y_test, s.betastar = \
            data.get_data_train_test(n_train=p.n_train, n_test=p.n_test,
                                     p=p.num_features,
                                     noise_std=p.noise_std,
                                     noise_distr=p.noise_distr,
                                     iid=p.iid,
                                     beta_type=p.beta_type,
                                     beta_norm=p.beta_norm,
                                     seed_for_training_data=p.seed,
                                     cov_param=p.cov_param)
    elif p.dset == 'pmlb':
        s.dset_name = regression_dsets_large_names[p.dset_num]
        # fixed seed so the train/test split is identical across runs
        seed(703858704)
        X, y = pmlb.fetch_data(s.dset_name, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y)  # get test set
        seed(p.seed)
        X_train, y_train = shuffle(X_train, y_train)
        p.num_features = X_train.shape[1]
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        # NOTE(review): exit(0) (success code) on an infeasible n looks like a
        # deliberate "skip this sweep point" — left as-is.
        if p.n_train > X_train.shape[0]:
            print('this value of n too large')
            exit(0)
        elif p.n_train <= 1:
            print('this value of n too small')
            exit(0)
        else:
            X_train = X_train[:p.n_train]
            y_train = y_train[:p.n_train]

    # ----------------------------- FITTING -----------------------------
    if not p.model_type == 'rf':
        # fit model
        if p.model_type == 'linear_sta':
            # single-pass "statistical" estimate: w = X^T y / n
            s.w = X_train.T @ y_train / X_train.shape[0]
        elif 'mdl' in p.model_type:
            if p.model_type == 'mdl_orig':
                # MDL-style penalty optimized in the SVD basis of X/sqrt(n)
                U, sv, Vh = npl.svd(X_train / np.sqrt(p.n_train))
                a = U.T @ y_train  # / (np.sqrt(p.n_train) * p.noise_std)
                a = a[:sv.size]

                def mdl_loss(l):
                    return np.sum(
                        np.square(a) / (1 + np.square(sv) / l)
                        + np.log(1 + np.square(sv) / l))

                opt_solved = minimize(mdl_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train / p.n_train
                               + s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train / p.n_train
            elif p.model_type == 'mdl_m1':
                # one-part MDL objective: data-fit + weight norm + eigen penalty
                eigenvals, eigenvecs = npl.eig(X_train.T @ X_train)
                var = p.noise_std**2

                def mdl1_loss(l):
                    inv = npl.pinv(X_train.T @ X_train
                                   + l * np.eye(p.num_features))
                    thetahat = inv @ X_train.T @ y_train
                    mse_norm = npl.norm(y_train - X_train @ thetahat)**2 / (2 * var)
                    theta_norm = npl.norm(thetahat)**2 / (2 * var)
                    eigensum = 0.5 * np.sum(np.log((eigenvals + l) / l))
                    return mse_norm + theta_norm + eigensum

                opt_solved = minimize(mdl1_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train
                               + s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train
        else:
            # sklearn linear models; reg_param == -1 means "pick alpha by CV"
            if p.model_type == 'ols':
                m = LinearRegression(fit_intercept=False)
            elif p.model_type == 'lasso':
                m = Lasso(fit_intercept=False, alpha=p.reg_param)
            elif p.model_type == 'ridge':
                if p.reg_param == -1:
                    m = RidgeCV(fit_intercept=False,
                                alphas=np.logspace(-3, 3, num=10, base=10))
                else:
                    m = Ridge(fit_intercept=False, alpha=p.reg_param)
            m.fit(X_train, y_train)
            if p.reg_param == -1:
                s.lambda_opt = m.alpha_
            s.w = m.coef_

        # save degrees of freedom
        if p.model_type == 'ridge':
            # BUGFIX: when reg_param == -1 (CV sentinel) the original code
            # plugged -1 itself into the hat matrix; use the CV-selected
            # penalty instead. (Local renamed S -> H to avoid shadowing the
            # module-level S class.)
            lam = s.lambda_opt if p.reg_param == -1 else p.reg_param
            H = X_train @ np.linalg.pinv(
                X_train.T @ X_train
                + lam * np.eye(X_train.shape[1])) @ X_train.T
            s.df1 = np.trace(H @ H.T)
            s.df2 = np.trace(2 * H - H.T @ H)
            s.df3 = np.trace(H)
        else:
            # unregularized: df equals the rank bound min(n, p)
            s.df1 = min(p.n_train, p.num_features)
            s.df2 = s.df1
            s.df3 = s.df1

        # store predictions and things about w
        s.wnorm = np.linalg.norm(s.w)
        s.num_nonzero = np.count_nonzero(s.w)
        s.preds_train = X_train @ s.w
        s.preds_test = X_test @ s.w
    elif p.model_type == 'rf':
        rf = RandomForestRegressor(n_estimators=p.num_trees,
                                   max_depth=p.max_depth)
        rf.fit(X_train, y_train)
        s.preds_train = rf.predict(X_train)
        s.preds_test = rf.predict(X_test)

    # final metrics + persist
    s.train_mse = metrics.mean_squared_error(s.preds_train, y_train)
    s.test_mse = metrics.mean_squared_error(s.preds_test, y_test)
    save(out_name, p, s)
def fit_vision(p):
    """Train a vision model per params `p`, recording rich statistics each iter.

    Builds data loaders and a model from the project `data`/`init` helpers,
    then loops for ``p.num_iters`` iterations: records losses/accuracies/
    margins, weight snapshots and norms (optionally singular values and a
    reduced model), runs one (possibly truncated) pass of SGD with an optional
    GAN-discriminator regularizer, adjusts LR/freezing at ``p.lr_ticks``, and
    periodically saves. Handles 'flip' and 'permute' dataset schedules.
    """
    out_name = p._str(p)  # generate random fname str before saving
    # NOTE(review): `seed` receives the whole params object (elsewhere in this
    # file it gets p.seed) — presumably the helper accepts both; confirm.
    seed(p)
    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'

    # pick dataset and model
    print('loading dset...')
    train_loader, test_loader = data.get_data_loaders(p)
    X_train, Y_train_onehot = data.get_XY(train_loader)
    model = data.get_model(p, X_train, Y_train_onehot)
    init.initialize_weights(p, X_train, Y_train_onehot, model)

    # set up optimizer and freeze appropriate layers
    model, optimizer = optimization.freeze_and_set_lr(p, model, it=0)

    def reg_init(p):
        # Returns a pretrained GAN discriminator used as a regularizer,
        # or None when regularization is disabled (lambda_reg == 0).
        if p.lambda_reg == 0:
            return None

        # load the gan (hard-coded cluster path)
        gan_dir = '/accounts/projects/vision/chandan/gan/mnist_dcgan'
        sys.path.insert(1, gan_dir)
        from dcgan import Discriminator
        D = Discriminator(
            ngpu=1 if torch.cuda.is_available() else 0).to(device)
        D.load_state_dict(
            torch.load(oj(gan_dir, 'weights/netD_epoch_99.pth'),
                       map_location=device))
        D = D.eval()
        return D

    def reg(p, it, model, D, device):
        # Regularization term: penalize prototype images (model.exs) that the
        # discriminator scores as fake. Returns 0 when disabled.
        if p.lambda_reg == 0:
            return 0
        exs = model.exs.reshape(model.exs.shape[0], 1, 28, 28)  # mnist-specific
        outputs = D(exs)
        # discriminator outputs 1 for real, 0 for fake
        loss = p.lambda_reg * torch.sum(1 - outputs)
        return loss

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    if 'linear' in p.dset:
        criterion = nn.MSELoss()
    reg_model = reg_init(p)

    # things to record
    s = S(p)
    s.weight_names = models.get_weight_names(model)
    if p.siamese:
        s.exs = model.exs.data.cpu().numpy()

    # run
    print('training...')
    for i, it in enumerate(tqdm(range(0, p.num_iters))):

        # calc stats and record (train and test sets)
        s.losses_train[it], s.accs_train[it], s.confidence_unn_train[
            it], s.confidence_norm_train[it], s.margin_unn_train[
                it], s.margin_norm_train[it] = stats.calc_loss_acc_margins(
                    train_loader, p.batch_size, use_cuda, model, criterion,
                    p.dset)
        s.losses_test[it], s.accs_test[it], s.confidence_unn_test[
            it], s.confidence_norm_test[it], s.margin_unn_test[
                it], s.margin_norm_test[it] = stats.calc_loss_acc_margins(
                    test_loader, p.batch_size, use_cuda, model,
                    criterion, p.dset, print_loss=True)

        # record weights (snapshots keyed by the logical iteration p.its[it])
        weight_dict = deepcopy(
            {x[0]: x[1].data.cpu().numpy()
             for x in model.named_parameters()})
        s.weights_first10[p.its[it]] = deepcopy(
            model.state_dict()[s.weight_names[0]][:20].cpu().numpy())
        s.weight_norms[p.its[it]] = stats.layer_norms(model.state_dict())
        if it % p.save_all_weights_freq == 0 or it == p.num_iters - 1 or it == 0 or (
                it < p.num_iters_small and it % 2 == 0):  # save first, last, jumps
            s.weights[p.its[it]] = weight_dict
            # NOTE(review): nesting of this block inside the checkpoint `if`
            # was reconstructed from mangled formatting; it matches the
            # p.its[it] keying used by the other checkpointed stats — confirm.
            if not p.use_conv:
                s.mean_max_corrs[p.its[it]] = stats.calc_max_corr_input(
                    X_train, Y_train_onehot, model)

        if p.save_singular_vals:
            # weight singular vals
            s.singular_val_dicts.append(
                get_singular_vals_from_weight_dict(weight_dict))
            s.singular_val_dicts_cosine.append(
                get_singular_vals_kernels(weight_dict, 'cosine'))
            s.singular_val_dicts_rbf.append(
                get_singular_vals_kernels(weight_dict, 'rbf'))
            s.singular_val_dicts_lap.append(
                get_singular_vals_kernels(weight_dict, 'laplacian'))

            # activations singular vals
            act_var_dicts = calc_activation_dims(
                use_cuda, model, train_loader.dataset, test_loader.dataset,
                calc_activations=p.calc_activations)
            s.act_singular_val_dicts_train.append(
                act_var_dicts['train']['pca'])
            s.act_singular_val_dicts_test.append(act_var_dicts['test']['pca'])
            s.act_singular_val_dicts_train_rbf.append(
                act_var_dicts['train']['rbf'])
            s.act_singular_val_dicts_test_rbf.append(
                act_var_dicts['test']['rbf'])

        # reduced model (only loss/acc, hence the [:2] slice)
        if p.save_reduce:
            model_r = reduce_model(model)
            s.losses_train_r[it], s.accs_train_r[
                it] = stats.calc_loss_acc_margins(train_loader, p.batch_size,
                                                  use_cuda, model_r, criterion,
                                                  p.dset)[:2]
            s.losses_test_r[it], s.accs_test_r[
                it] = stats.calc_loss_acc_margins(test_loader, p.batch_size,
                                                  use_cuda, model_r, criterion,
                                                  p.dset)[:2]

        # training
        for batch_idx, (x, target) in enumerate(train_loader):
            optimizer.zero_grad()
            x = x.to(device)
            target = target.to(device)
            x, target = Variable(x), Variable(target)
            out = model(x)
            loss = criterion(out, target) + reg(p, it, model, reg_model,
                                                device)
            loss.backward()
            optimizer.step()

            # don't go through whole dataset (early iters save more often)
            if batch_idx > len(
                    train_loader
            ) / p.saves_per_iter and it <= p.saves_per_iter * p.saves_per_iter_end + 1:
                break

        # set lr / freeze
        if it - p.num_iters_small in p.lr_ticks:
            model, optimizer = optimization.freeze_and_set_lr(p, model, it)

        if it % p.save_all_freq == 0:
            save(out_name, p, s)

        # check for need to flip dset
        if 'flip' in p.dset and it == p.num_iters // 2:
            print('flipped dset')
            s.flip_iter = p.num_iters // 2  # flip_iter tells when dset flipped
            train_loader, test_loader = data.get_data_loaders(p,
                                                              it=s.flip_iter)
            X_train, Y_train_onehot = data.get_XY(train_loader)
            if p.flip_freeze:
                p.freeze = 'last'
                model, optimizer = optimization.freeze_and_set_lr(p, model, it)
        elif 'permute' in p.dset and it > 0 and p.its[it] % p.change_freq == 0:
            # record each permutation seed so the schedule can be replayed
            s.permute_rng.append(int(p.its[it]))
            train_loader, test_loader = data.get_data_loaders(
                p, it=s.permute_rng[-1])
            X_train, Y_train_onehot = data.get_XY(train_loader)

    # final save after the training loop
    save(out_name, p, s)