def est_cps(cruise, bio_features, phys_features, max_ncp=150, min_dists=[5], kernel_types=['Gaussian-Euclidean'],
            bw_method='rule-of-thumb', subsample_num=1, subsample_of=1, save_dir='../results/'):
    """
    Estimate the locations of change points in the input biological and physical features for a single cruise.

    The estimates are written as JSON files under ``<save_dir>/<cruise>/``; nothing is returned.

    :param cruise: Name of the cruise the features are from. Used as the name of the output sub-directory.
    :param bio_features: Features for the biological data. Must expose ``shape[1]`` (recorded in the output file
                         name) and ``len()``.
    :param phys_features: Features for the physical data, or None to skip the physical estimation.
    :param max_ncp: Maximum number of change points in a sequence.
    :param min_dists: List of minimum acceptable distances between change points.
    :param kernel_types: List containing 'Gaussian-Euclidean' (Gaussian RBF kernel) and/or 'Linear'.
    :param bw_method: Method to use for obtaining the bandwidth(s). 'rule-of-thumb' uses the single rule-of-thumb
                      bandwidth returned by get_bw_range; any other value (e.g. 'list') uses the full list of
                      bandwidths it returns.
    :param subsample_num: Subsample number being used. The output file name records ``subsample_num + 1``.
    :param subsample_of: Number of subsamples previously generated for this cruise. When it is 1 the subsample
                         suffix is omitted from the output file name.
    :param save_dir: Top-level directory where the results will be stored.
    """
    # The list defaults in the signature are only iterated over, never mutated, so sharing them between calls is safe.
    projection_dim = bio_features.shape[1]
    cruise_dir = os.path.join(save_dir, cruise)

    for min_dist in min_dists:
        # exist_ok avoids the check-then-create race of testing os.path.exists first
        os.makedirs(cruise_dir, exist_ok=True)

        # Perform change-point estimation on the physical data
        if phys_features is not None:
            cps_phys, objs_phys = cp_estimation.mkcpe(
                X=phys_features,
                n_cp=(1, min(max_ncp, int((len(phys_features)-1)/min_dist)-1)),
                kernel_type='linear', min_dist=min_dist, return_obj=True)
            # Convert each array of change points to a plain list so that it can be serialized to JSON
            for key in cps_phys.keys():
                cps_phys[key] = cps_phys[key].flatten().tolist()
            # NOTE(review): this file name does not depend on min_dist, so with several entries in min_dists
            # only the result for the last one survives -- confirm that this is intended.
            save_path = os.path.join(cruise_dir, 'cps_phys.json')
            # Use a context manager so the file is flushed and closed deterministically
            with open(save_path, 'w') as f:
                json.dump({'cps_phys': cps_phys, 'objs_phys': objs_phys}, f)

        for kernel_type in kernel_types:
            # Get the bandwidth(s) (if applicable)
            if kernel_type != 'Linear':
                rot_bw, bws = get_bw_range(bio_features)
                all_bws = [rot_bw] if bw_method == 'rule-of-thumb' else bws
            else:
                # The bandwidth is a placeholder for the linear kernel
                all_bws = [0]

            for bw in all_bws:
                # Perform change-point estimation on the biological data
                cps_bio, objs_bio = cp_estimation.mkcpe(
                    X=bio_features,
                    n_cp=(1, min(max_ncp, int((len(bio_features)-1)/min_dist)-1)),
                    kernel_type=kernel_type, bw=bw, min_dist=min_dist, return_obj=True)
                for key in cps_bio.keys():
                    cps_bio[key] = cps_bio[key].flatten().tolist()

                # Short bandwidth tag used in the output file name
                bw_short = str(np.round(bw, 3))
                if bw_method == 'rule-of-thumb':
                    bw_short = 'rule-of-thumb_' + bw_short

                file_name = 'cps_bio_' + str(projection_dim) + '_' + kernel_type + '_' + str(bw_short) + '_' \
                            + str(min_dist)
                if subsample_of != 1:
                    file_name += '_subsample_' + str(subsample_num+1) + '_of_' + str(subsample_of)
                save_path = os.path.join(cruise_dir, file_name + '.json')

                with open(save_path, 'w') as f:
                    json.dump({'cps_bio': cps_bio, 'bw': bw, 'objs_bio': objs_bio}, f)
def _alternating_optimization(self, x, cps, ncp):
    """
    Estimate the change points for the given features if they are unknown and then return the resultant objective.

    Gradients of the model are reset, the objective is accumulated over all sequences, and ``backward()`` is called
    on the result before it is returned.

    :param x: Sequences of inputs; ``x[i]`` is passed to opt_utils.compute_features together with the model.
    :param cps: Per-sequence change points. An empty entry (``len(cps[i]) == 0``) means the change points of
                sequence i are unknown and must be estimated.
    :param ncp: Per-sequence number of change points; ``ncp[i]`` is used only when ``cps[i]`` is empty.
    :return: The negated objective averaged over the sequences in x (after ``backward()`` has been called on it).
    """
    self.model.zero_grad()
    obj_value = 0
    for i in range(len(x)):
        features = opt_utils.compute_features(x[i], self.model)[0]
        if len(cps[i]) == 0:
            # The change points are unknown: estimate them from the current features. The features must be
            # detached before converting to numpy: ``.cpu()`` returns the very same tensor when it already
            # lives on the CPU, and ``no_grad()`` does not clear ``requires_grad`` on an existing tensor, so
            # ``.numpy()`` would otherwise raise for features that are part of the autograd graph.
            with torch.autograd.no_grad():
                est_cps = cp_estimation.mkcpe(
                    features.detach().cpu().numpy(), n_cp=ncp[i], kernel_type='linear',
                    min_dist=self.params.min_dist, return_obj=False).ravel()
        else:
            est_cps = cps[i]
        # The objective is subtracted, so the value being accumulated is its negation
        obj_value = obj_value - opt_utils.compute_obj(features, est_cps, self.params)

    obj_value = obj_value / len(x)
    obj_value.backward()

    return obj_value
def est_cps_objs(phys_data, max_cp, min_dist=5):
    """
    Compute the change-point objective on the physical data for every candidate number of change points.

    The 'temp' and 'salinity' columns are standardized, change points are estimated with a linear kernel for
    0 up to max_cp change points, and each objective value is divided by the number of observations.

    :param phys_data: Physical data on which to estimate change points. Must support selecting the columns
                      ['temp', 'salinity'].
    :param max_cp: Largest number of change points to estimate.
    :param min_dist: Minimum allowable distance between change points.
    :return: objs_phys: Objective values (divided by the sequence length) when setting the number of change
                        points to each of 0, 1, 2,..., max_cp (or the maximum possible number of changes given
                        that the minimum distance between change points is min_dist).
    """
    raw_features = np.asarray(phys_data[['temp', 'salinity']])
    phys_features = StandardScaler().fit_transform(raw_features)
    num_obs = len(phys_features)

    # Cap the number of change points at what the minimum spacing allows
    largest_ncp = min(max_cp, int((num_obs - 1) / min_dist) - 1)
    _, objs_phys = cp_estimation.mkcpe(X=phys_features,
                                       n_cp=(0, largest_ncp),
                                       kernel_type='linear',
                                       min_dist=min_dist,
                                       return_obj=True)

    # Normalize in place so that the returned object is the one produced by mkcpe
    for ncp_key in objs_phys:
        objs_phys[ncp_key] = objs_phys[ncp_key] / num_obs

    return objs_phys
# Pick the kernel bandwidth: a user-supplied value wins; otherwise use the median heuristic, i.e. the median of all
# pairwise distances between the observations of the first training sequence
if args.bw is not None:
    bw = args.bw
else:
    pairwise_dists = sklearn.metrics.pairwise.pairwise_distances(train_loader.dataset.features[0].numpy())
    bw = np.median(pairwise_dists.ravel())
    print('Bandwidth from median heuristic: %0.2f' % bw)

# Set up the path where the results will be saved. The file prefix encodes the run configuration plus a timestamp
# and is appended directly to save_dir (no path separator is inserted).
save_dir = args.save_path
save_file = save_dir + '_'.join([str(bw), str(args.data_difference), args.kernel, str(args.min_dist),
                                 str(args.seed), str(args.window_size), str(time.time())])
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Estimate the change points in the first sequence of the validation and test sets (given the true number of change
# points), evaluate the performance, and save the results
hausdorff1s = {'valid_hausdorff1': 0, 'test_hausdorff1': 0}
frobeniuses = {'valid_frobenius': 0, 'test_frobenius': 0}
for dataset_name, data_loader in (('valid', valid_loader), ('test', test_loader)):
    dataset = data_loader.dataset
    X = dataset.features[0].cpu().numpy()
    true_cps = dataset.true_change_points[0].numpy()
    all_cps, objs = cp_estimation.mkcpe(X,
                                        n_cp=len(true_cps),
                                        bw=bw,
                                        kernel_type=args.kernel,
                                        min_dist=args.min_dist,
                                        return_obj=True)
    hausdorff1s[dataset_name + '_hausdorff1'] = evaluation.compute_hausdorff1(all_cps.flatten(), true_cps)
    frobeniuses[dataset_name + '_frobenius'] = evaluation.compute_frobenius(all_cps.flatten(), true_cps, len(X))

print('Average Frobenius distance on the test set: %0.2f' % np.mean(frobeniuses['test_frobenius']))

results = opt_structures.Results(save_path=save_file + '_results.pickle')
results.update(0, **hausdorff1s, **frobeniuses)
results.save()
# Estimate the change points in the sequences of the validation and test sets (given the true number of change
# points) and average the Hausdorff and Frobenius distances over each set
hausdorff1s = {'valid_hausdorff1': 0, 'test_hausdorff1': 0}
frobeniuses = {'valid_frobenius': 0, 'test_frobenius': 0}
for dataset_name, data_loader in zip(['valid', 'test'], [valid_loader, test_loader]):
    # Iterate over the loader directly instead of calling next() under a bare ``except``: the loop still stops
    # at exhaustion, but genuine failures (data-loading errors, batches of unexpected structure,
    # KeyboardInterrupt) now propagate instead of silently truncating the averages.
    for X, _, _, _, true_cps, _ in data_loader:
        # Only the first element of each batch is evaluated, and the sums below are divided by the number of
        # batches -- presumably the loaders use a batch size of 1; TODO confirm.
        # Flatten the two trailing dimensions of every observation into a single feature vector
        X = X[0].reshape(-1, X[0].shape[1] * X[0].shape[2]).numpy().astype('double')
        true_cps = true_cps[0]
        all_cps, obj = cp_estimation.mkcpe(X, n_cp=len(true_cps), bw=bw, kernel_type='gaussian-euclidean',
                                           min_dist=args.min_dist, return_obj=True)
        hausdorff1s[dataset_name + '_hausdorff1'] += evaluation.compute_hausdorff1(
            all_cps.flatten(), true_cps)
        frobeniuses[dataset_name + '_frobenius'] += evaluation.compute_frobenius(
            all_cps.flatten(), true_cps, len(X))

    hausdorff1s[dataset_name + '_hausdorff1'] /= len(data_loader)
    frobeniuses[dataset_name + '_frobenius'] /= len(data_loader)

print('Average Frobenius distance on the test set: %0.2f' % np.mean(frobeniuses['test_frobenius']))

results = opt_structures.Results(save_path=save_file + '_results.pickle')
results.update(0, **hausdorff1s, **frobeniuses)
def evaluate_features(data, model, params):
    """
    Evaluate the current performance of the model using the given data.

    Features are computed for every dataset, change points are estimated on each sequence (given the true number
    of change points) with a linear kernel, and the resulting metrics are averaged per dataset. When labeled
    training data with a positive label are available, a classifier is trained on the features and used to
    measure labeling accuracy; otherwise the accuracy of non-empty datasets is NaN.

    :param data: Data object containing the training, validation, and test set dataloaders
    :param model: Model object containing the architecture used in training
    :param params: Parameters object (``num_classes`` and ``min_dist`` are read from it)
    :return: collections.Counter with the keys '<dataset>_loss', '<dataset>_penalized_loss',
             '<dataset>_hausdorff1', '<dataset>_frobenius' and '<dataset>_accuracy' for each of the datasets
             'train_labeled', 'train_unlabeled', 'valid' and 'test'. Empty datasets get infinite losses and
             distances and an accuracy of 0.
    """
    all_features = opt_utils.compute_all_features(
        data.train_labeled_loader, data.train_unlabeled_loader, data.valid_loader, data.test_loader, model)

    # Train a classifier on the labeled training features only if there are any and the first label sequence
    # contains a positive label
    labeled = all_features['train_labeled']
    w = None
    if len(labeled['x']) > 0 and torch.max(labeled['y'][0]) > 0:
        w = train_classifier.train(
            (labeled['x'], labeled['y']),
            (all_features['valid']['x'], all_features['valid']['y']),
            (None, None), None, params.num_classes, 100, loss_name='mnl', input_features=True)[5]

    results = collections.Counter()
    for dataset_name in ('train_labeled', 'train_unlabeled', 'valid', 'test'):
        split = all_features[dataset_name]
        X = split['x']
        # Prefer the ground truth stored under the '*_true' keys when it is present
        if 'y_true' in split:
            y_true, cps_true = split['y_true'], split['cps_true']
        else:
            y_true, cps_true = split['y'], split['cps']

        loss_key = dataset_name + '_loss'
        penalized_key = dataset_name + '_penalized_loss'
        hausdorff_key = dataset_name + '_hausdorff1'
        frobenius_key = dataset_name + '_frobenius'
        accuracy_key = dataset_name + '_accuracy'

        # Nothing to evaluate: record the worst possible values and move on
        if len(X) == 0:
            results[loss_key] = np.inf
            results[penalized_key] = np.inf
            results[hausdorff_key] = np.inf
            results[frobenius_key] = np.inf
            results[accuracy_key] = 0
            continue

        for i in range(len(X)):
            cps_hat, objective = cp_estimation.mkcpe(X[i].numpy(), n_cp=len(cps_true[i]), kernel_type='linear',
                                                     min_dist=params.min_dist, return_obj=True)
            cps_hat = cps_hat.flatten()
            cps_ref = cps_true[i].numpy()
            # The features are moved to the compute device in place (the stored sequence is replaced)
            X[i] = X[i].to(defaults.device)
            seq_len = len(X[i])
            mean_objective = objective / seq_len

            results[loss_key] += mean_objective
            results[penalized_key] += compute_penalized_loss(X[i], mean_objective, model, params).item()
            results[hausdorff_key] += compute_hausdorff1(cps_hat, cps_ref)
            results[frobenius_key] += compute_frobenius(cps_hat, cps_ref, seq_len)
            if w is None:
                # No classifier was trained, so the accuracy is undefined
                results[accuracy_key] = np.nan
            else:
                results[accuracy_key] += compute_num_correct_labels(
                    X[i], y_true[i].to(defaults.device), torch.from_numpy(cps_hat).to(defaults.device), w)

        # Average the per-sequence metrics over the sequences, and the label counts over all labels
        num_sequences = float(len(X))
        for metric_key in (loss_key, penalized_key, hausdorff_key, frobenius_key):
            results[metric_key] /= num_sequences
        results[accuracy_key] /= sum(len(y) for y in y_true)

    return results