Example #1
import json
import os

import numpy as np

# cp_estimation (kernel change-point estimation) and get_bw_range are
# project-local; their import paths depend on the repository layout.
def est_cps(cruise, bio_features, phys_features, max_ncp=150, min_dists=[5], kernel_types=['Gaussian-Euclidean'],
            bw_method='rule-of-thumb', subsample_num=1, subsample_of=1, save_dir='../results/'):
    """
    Estimate the locations of change points in the input biological and physical features for a single cruise.

    :param cruise: Name of the cruise the features are from.
    :param bio_features: Features for the biological data.
    :param phys_features: Features for the physical data.
    :param max_ncp: Maximum number of change points in a sequence.
    :param min_dists: List of minimum acceptable distances between change points.
    :param kernel_types: List containing 'Gaussian-Euclidean' (Gaussian RBF kernel) and/or 'Linear'.
    :param bw_method: Method to use for obtaining the bandwidth(s). Either 'rule-of-thumb' or 'list'.
    :param subsample_num: Subsample number being used.
    :param subsample_of: Number of subsamples previously generated for this cruise.
    :param save_dir: Top-level directory where the results will be stored.
    """
    projection_dim = bio_features.shape[1]
    for min_dist in min_dists:
        os.makedirs(os.path.join(save_dir, cruise), exist_ok=True)

        # Perform change-point estimation on the physical data
        if phys_features is not None:
            cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features,
                                                      n_cp=(1, min(max_ncp, int((len(phys_features)-1)/min_dist)-1)),
                                                      kernel_type='linear', min_dist=min_dist, return_obj=True)
            for key in cps_phys:
                cps_phys[key] = cps_phys[key].flatten().tolist()
            save_path = os.path.join(save_dir, cruise, 'cps_phys.json')
            with open(save_path, 'w') as f:
                json.dump({'cps_phys': cps_phys, 'objs_phys': objs_phys}, f)

        for kernel_type in kernel_types:
            # Get the bandwidth(s) (if applicable)
            if kernel_type != 'Linear':
                rot_bw, bws = get_bw_range(bio_features)
                all_bws = [rot_bw] if bw_method == 'rule-of-thumb' else bws
            else:
                all_bws = [0]
            for bw in all_bws:
                # Perform change-point estimation on the biological data
                cps_bio, objs_bio = cp_estimation.mkcpe(X=bio_features,
                                                        n_cp=(1, min(max_ncp, int((len(bio_features)-1)/min_dist)-1)),
                                                        kernel_type=kernel_type, bw=bw, min_dist=min_dist,
                                                        return_obj=True)
                for key in cps_bio:
                    cps_bio[key] = cps_bio[key].flatten().tolist()

                bw_short = ('rule-of-thumb_' if bw_method == 'rule-of-thumb' else '') + str(np.round(bw, 3))
                file_name = 'cps_bio_{}_{}_{}_{}'.format(projection_dim, kernel_type, bw_short, min_dist)
                if subsample_of != 1:
                    file_name += '_subsample_{}_of_{}'.format(subsample_num + 1, subsample_of)
                save_path = os.path.join(save_dir, cruise, file_name + '.json')
                with open(save_path, 'w') as f:
                    json.dump({'cps_bio': cps_bio, 'bw': bw, 'objs_bio': objs_bio}, f)
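A minimal way to exercise est_cps on synthetic data (hypothetical shapes and cruise name; the project's
cp_estimation module and get_bw_range helper must be importable for the call to run):

import numpy as np

# Two sequences over the same 500 time points, each with one mean shift
# halfway through.
rng = np.random.default_rng(0)
bio = np.concatenate([rng.normal(0, 1, (250, 5)), rng.normal(2, 1, (250, 5))])
phys = np.concatenate([rng.normal(0, 1, (250, 2)), rng.normal(1, 1, (250, 2))])

est_cps('example_cruise', bio, phys, max_ncp=10, min_dists=[10],
        kernel_types=['Gaussian-Euclidean'], save_dir='../results/')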
Example #2
    def _alternating_optimization(self, x, cps, ncp):
        """
        One alternating-optimization step: estimate change points for any sequence whose change points are
        unknown (with gradients disabled), then backpropagate through the averaged objective and return it.
        """
        self.model.zero_grad()
        obj_value = 0
        for i in range(len(x)):
            features = opt_utils.compute_features(x[i], self.model)[0]

            if len(cps[i]) == 0:
                with torch.autograd.no_grad():
                    est_cps = cp_estimation.mkcpe(
                        features.cpu().numpy(),
                        n_cp=ncp[i],
                        kernel_type='linear',
                        min_dist=self.params.min_dist,
                        return_obj=False).ravel()
            else:
                est_cps = cps[i]

            obj_value = obj_value - opt_utils.compute_obj(
                features, est_cps, self.params)

        obj_value = obj_value / len(x)
        obj_value.backward()

        return obj_value
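The method above alternates between estimating change points with the network held fixed and
backpropagating through the resulting objective. A self-contained sketch of that pattern, with hypothetical
stand-ins for the project modules (estimate_cps plays the role of cp_estimation.mkcpe, and neg_obj that of
the negated opt_utils.compute_obj):

import torch

def estimate_cps(features):
    # Placeholder estimator: put a single change point at the largest jump.
    jumps = (features[1:] - features[:-1]).norm(dim=1)
    return [int(torch.argmax(jumps)) + 1]

def neg_obj(features, cps):
    # Total within-segment variance (to be minimized).
    bounds = [0] + list(cps) + [len(features)]
    return sum(features[a:b].var(dim=0, unbiased=False).sum()
               for a, b in zip(bounds, bounds[1:]))

model = torch.nn.Linear(4, 2)
x = torch.randn(200, 4)
model.zero_grad()
features = model(x)
with torch.no_grad():          # change points are held fixed here...
    cps = estimate_cps(features)
loss = neg_obj(features, cps)  # ...while the objective is differentiated
loss.backward()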
Example #3
import numpy as np
from sklearn.preprocessing import StandardScaler

# cp_estimation is the project's kernel change-point estimation module.
def est_cps_objs(phys_data, max_cp, min_dist=5):
    """
    Estimate the locations of 0 to max_cp change points in the physical data and return the corresponding
    objective values.

    :param phys_data: Physical data on which to estimate change points.
    :param max_cp: Largest number of change points to estimate.
    :param min_dist: Minimum allowable distance between change points.
    :return: objs_phys: Objective values when setting the number of change points to each of 0, 1, 2,..., max_cp (or the
                        maximum possible number of changes given that the minimum distance between change points is
                        min_dist).
    """
    phys_features = np.asarray(phys_data[['temp', 'salinity']])
    phys_features = StandardScaler().fit_transform(phys_features)
    cps_phys, objs_phys = cp_estimation.mkcpe(
        X=phys_features,
        n_cp=(0, min(max_cp,
                     int((len(phys_features) - 1) / min_dist) - 1)),
        kernel_type='linear',
        min_dist=min_dist,
        return_obj=True)
    for key in objs_phys:
        objs_phys[key] = objs_phys[key] / len(phys_features)

    return objs_phys
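A minimal way to exercise est_cps_objs (synthetic data; numpy, pandas, scikit-learn, and the project's
cp_estimation module are assumed to be importable):

import numpy as np
import pandas as pd

# A DataFrame with 'temp' and 'salinity' columns and one simultaneous shift.
rng = np.random.default_rng(0)
phys_data = pd.DataFrame({
    'temp': np.r_[rng.normal(10, 1, 300), rng.normal(14, 1, 300)],
    'salinity': np.r_[rng.normal(33, 0.5, 300), rng.normal(35, 0.5, 300)]})

objs = est_cps_objs(phys_data, max_cp=10, min_dist=5)
# The objective values typically drop sharply up to the true number of changes
# and flatten afterwards, which downstream model selection can exploit.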
Example #4
if args.bw is None:
    # Median heuristic: use the median pairwise distance between feature vectors as the bandwidth.
    features = train_loader.dataset.features[0].numpy()
    bw = np.median(sklearn.metrics.pairwise.pairwise_distances(features).reshape(-1))
    print('Bandwidth from median heuristic: %0.2f' % bw)
else:
    bw = args.bw

# Set up the path where the results will be saved
save_dir = args.save_path
save_file = os.path.join(save_dir, '_'.join(str(v) for v in [
    bw, args.data_difference, args.kernel, args.min_dist, args.seed, args.window_size, time.time()]))

os.makedirs(save_dir, exist_ok=True)

# Estimate the change points in every sequence in the validation and test sets, evaluate the performance, and save the
# results
hausdorff1s = {'valid_hausdorff1': 0, 'test_hausdorff1': 0}
frobeniuses = {'valid_frobenius': 0, 'test_frobenius': 0}
for dataset_name, data_loader in zip(['valid', 'test'], [valid_loader, test_loader]):
    X = data_loader.dataset.features[0].cpu().numpy()
    true_cps = data_loader.dataset.true_change_points[0].numpy()
    all_cps, objs = cp_estimation.mkcpe(X, n_cp=len(true_cps), bw=bw,
                                        kernel_type=args.kernel, min_dist=args.min_dist, return_obj=True)
    hausdorff1s[dataset_name + '_hausdorff1'] = evaluation.compute_hausdorff1(all_cps.flatten(), true_cps)
    frobeniuses[dataset_name + '_frobenius'] = evaluation.compute_frobenius(all_cps.flatten(), true_cps, len(X))

print('Average Frobenius distance on the test set: %0.2f' % frobeniuses['test_frobenius'])
results = opt_structures.Results(save_path=save_file + '_results.pickle')
results.update(0, **hausdorff1s, **frobeniuses)
results.save()
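The median heuristic used above is easy to reproduce in isolation; a standalone sketch on synthetic data:

import numpy as np
from sklearn.metrics import pairwise_distances

# Median heuristic for a Gaussian RBF bandwidth: the median of all pairwise
# Euclidean distances between observations.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 8))
bw = np.median(pairwise_distances(X).reshape(-1))
print('Bandwidth from median heuristic: %0.2f' % bw)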
Example #5
hausdorff1s = {'valid_hausdorff1': 0, 'test_hausdorff1': 0}
frobeniuses = {'valid_frobenius': 0, 'test_frobenius': 0}
for dataset_name, data_loader in zip(['valid', 'test'],
                                     [valid_loader, test_loader]):
    for X, _, _, _, true_cps, _ in data_loader:
        # Flatten each window into a single feature vector.
        X = X[0].reshape(-1, X[0].shape[1] *
                         X[0].shape[2]).numpy().astype('double')
        true_cps = true_cps[0]
        all_cps, obj = cp_estimation.mkcpe(X,
                                           n_cp=len(true_cps),
                                           bw=bw,
                                           kernel_type='gaussian-euclidean',
                                           min_dist=args.min_dist,
                                           return_obj=True)
        hausdorff1s[dataset_name +
                    '_hausdorff1'] += evaluation.compute_hausdorff1(
                        all_cps.flatten(), true_cps)
        frobeniuses[dataset_name +
                    '_frobenius'] += evaluation.compute_frobenius(
                        all_cps.flatten(), true_cps, len(X))
    hausdorff1s[dataset_name + '_hausdorff1'] /= len(data_loader)
    frobeniuses[dataset_name + '_frobenius'] /= len(data_loader)

print('Average Frobenius distance on the test set: %0.2f' %
      frobeniuses['test_frobenius'])
results = opt_structures.Results(save_path=save_file + '_results.pickle')
results.update(0, **hausdorff1s, **frobeniuses)
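For intuition about the metric being accumulated, here is one common definition of the Hausdorff distance
between change-point sets (an assumption for illustration; the project's evaluation.compute_hausdorff1 may
scale or normalize differently):

import numpy as np

def hausdorff(est, true):
    # Largest distance from any change point in one set to its nearest
    # counterpart in the other set.
    est, true = np.asarray(est), np.asarray(true)
    d = np.abs(est[:, None] - true[None, :])
    return max(d.min(axis=1).max(), d.min(axis=0).max())

print(hausdorff([100, 205], [100, 200]))  # -> 5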
Example #6
def evaluate_features(data, model, params):
    """
    Evaluate the current performance of the model using the given data.

    :param data: Data object containing the training, validation, and test set dataloaders
    :param model: Model object containing the architecture used in training
    :param params: Parameters object
    :return: Dictionary of results with the loss, penalized loss, Hausdorff and Frobenius dissimilarity
             measures, and accuracy on each dataset
    """
    all_features = opt_utils.compute_all_features(data.train_labeled_loader,
                                                  data.train_unlabeled_loader,
                                                  data.valid_loader,
                                                  data.test_loader, model)
    if len(all_features['train_labeled']['x']) > 0 and torch.max(
            all_features['train_labeled']['y'][0]) > 0:
        w = train_classifier.train(
            (all_features['train_labeled']['x'],
             all_features['train_labeled']['y']),
            (all_features['valid']['x'], all_features['valid']['y']),
            (None, None),
            None,
            params.num_classes,
            100,
            loss_name='mnl',
            input_features=True)[5]
    else:
        w = None

    results = collections.Counter()
    dataset_names = ['train_labeled', 'train_unlabeled', 'valid', 'test']
    for dataset_name in dataset_names:
        X = all_features[dataset_name]['x']
        if 'y_true' in all_features[dataset_name]:
            y_true = all_features[dataset_name]['y_true']
            cps_true = all_features[dataset_name]['cps_true']
        else:
            y_true = all_features[dataset_name]['y']
            cps_true = all_features[dataset_name]['cps']
        if len(X) > 0:
            for i in range(len(X)):
                est_cps, obj = cp_estimation.mkcpe(X[i].numpy(),
                                                   n_cp=len(cps_true[i]),
                                                   kernel_type='linear',
                                                   min_dist=params.min_dist,
                                                   return_obj=True)

                est_cps = est_cps.flatten()
                true_cps_i = cps_true[i].numpy()
                X[i] = X[i].to(defaults.device)

                results[dataset_name + '_loss'] += obj / len(X[i])
                results[dataset_name +
                        '_penalized_loss'] += compute_penalized_loss(
                            X[i], obj / len(X[i]), model, params).item()
                results[dataset_name + '_hausdorff1'] += compute_hausdorff1(
                    est_cps, true_cps_i)
                results[dataset_name + '_frobenius'] += compute_frobenius(
                    est_cps, true_cps_i, len(X[i]))
                if w is not None:
                    results[dataset_name +
                            '_accuracy'] += compute_num_correct_labels(
                                X[i], y_true[i].to(defaults.device),
                                torch.from_numpy(est_cps).to(defaults.device),
                                w)
                else:
                    results[dataset_name + '_accuracy'] = np.nan
            for key in results:
                if dataset_name in key:
                    if 'accuracy' not in key:
                        results[key] /= len(X)
                    else:
                        results[key] /= sum(len(y) for y in y_true)

        else:
            results[dataset_name + '_loss'] = np.inf
            results[dataset_name + '_penalized_loss'] = np.inf
            results[dataset_name + '_hausdorff1'] = np.inf
            results[dataset_name + '_frobenius'] = np.inf
            results[dataset_name + '_accuracy'] = 0

    return results
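The aggregation in the loop above follows a simple pattern: per-sequence metrics accumulate in a Counter,
dissimilarities are averaged over the number of sequences, and accuracy counts are normalized by the total
number of labels. In miniature, with hypothetical values:

import collections

results = collections.Counter()
frobenius_per_seq = [0.8, 1.1, 0.5]
correct_per_seq, labels_per_seq = [90, 45], [100, 50]
for f in frobenius_per_seq:
    results['valid_frobenius'] += f
for c in correct_per_seq:
    results['valid_accuracy'] += c
results['valid_frobenius'] /= len(frobenius_per_seq)  # -> 0.8
results['valid_accuracy'] /= sum(labels_per_seq)      # -> 0.9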