def get_results_bih_dataset(seqs, labels, hdim, num_clusters):
  """Clusters labeled ECG sequences and returns the results dataframe.

  Args:
    seqs: Sequences to cluster. Each element must expose a `seq_len`
      attribute.
    labels: One label per sequence. Each label is converted to an integer
      cluster id, namely its index in the sorted array of unique labels.
    hdim: Forwarded to `create_model_fns` and used as the number of PCA
      components.
    num_clusters: Number of clusters forwarded to `clustering.get_results`.

  Returns:
    The results object produced by `clustering.get_results`.
  """
  unique_labels, occurrences = np.unique(labels, return_counts=True)
  counts_summary = ' '.join((str(unique_labels), str(occurrences)))
  logging.info('Counts of labels in current run: %s', counts_summary)

  # Map every label to its position in the sorted label vocabulary.
  label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
  cluster_ids = [label_to_id[label] for label in labels]

  model_fns = create_model_fns(hdim)

  # Fit the PCA baseline on the padded sequence matrix, ignoring NaN rows.
  padded_matrix = clustering.pad_seqs_to_matrix(seqs)
  longest_seq_len = np.max([seq.seq_len for seq in seqs])
  pca_model = sklearn.decomposition.PCA(n_components=hdim)
  pca = pca_model.fit(_drop_nan_rows(padded_matrix))

  def _pca_embedding(seq):
    """Pads one sequence to the common length and projects it with PCA."""
    single_row = clustering.pad_seqs_to_matrix([seq], longest_seq_len)
    projected = pca.transform(_replace_nan_with_0(single_row))
    return projected.flatten()

  model_fns['PCA'] = _pca_embedding

  # Get clustering results.
  results_df = clustering.get_results(
      seqs,
      num_clusters,
      cluster_ids,
      None,
      model_fns,
      include_tslearn=FLAGS.include_tslearn,
      include_slow_methods=FLAGS.include_tslearn_slow)
  logging.info(results_df)

  if FLAGS.plot_clusters:
    figure_path = os.path.join(FLAGS.output_dir, 'visualization.png')
    clustering.visualize_clusters(seqs, None, labels, model_fns, figure_path)
  return results_df
def get_results_bih_dataset(hdim,
                            num_clusters,
                            pos_samples=25,
                            neg_samples=25):
  """Loads ECG sequences, clusters them, and returns the results.

  Args:
    hdim: Forwarded to `parse_examples_to_seq` and `create_model_fns`, and
      used as the number of PCA components.
    num_clusters: Number of clusters forwarded to `clustering.get_results`.
    pos_samples: Forwarded to `parse_examples_to_seq` (presumably the number
      of positive examples to load; confirm against that helper).
    neg_samples: Forwarded to `parse_examples_to_seq` (presumably the number
      of negative examples to load; confirm against that helper).

  Returns:
    The results object produced by `clustering.get_results`. It is also
    printed, and a cluster visualization is written to
    `visualization.png` under `FLAGS.output_dir`.
  """
  seqs, labels = parse_examples_to_seq(hdim, pos_samples, neg_samples)
  model_fns = create_model_fns(hdim)

  # Fit the PCA baseline on all sequences padded to a common length.
  padded_matrix = clustering.pad_seqs_to_matrix(seqs)
  longest_seq_len = np.max([seq.seq_len for seq in seqs])
  pca = sklearn.decomposition.PCA(n_components=hdim).fit(padded_matrix)

  def _pca_embedding(seq):
    """Pads one sequence to the common length and projects it with PCA."""
    single_row = clustering.pad_seqs_to_matrix([seq], longest_seq_len)
    return pca.transform(single_row).flatten()

  model_fns['PCA'] = _pca_embedding

  # Get clustering results.
  results_df = clustering.get_results(
      seqs,
      num_clusters,
      labels,
      None,
      model_fns,
      include_tslearn=FLAGS.include_tslearn)
  print(results_df)

  figure_path = os.path.join(FLAGS.output_dir, 'visualization.png')
  clustering.visualize_clusters(seqs, None, labels, model_fns, figure_path)
  return results_df
def get_results(cluster_center_eigvalues,
                cluster_center_dist_lower_bound,
                hidden_state_dim,
                input_dim,
                guessed_hidden_dim,
                num_clusters,
                guessed_num_clusters,
                min_seq_len,
                max_seq_len,
                num_sampled_seq_len,
                num_repeat,
                num_systems,
                cluster_radius,
                input_mean,
                input_stddev,
                output_noise_stddev,
                init_state_mean=0.0,
                init_state_stddev=0.0,
                generate_diagonalizable_only=False,
                random_seed=0,
                results_path=None):
  """Get results for varying sequence lengths.

  Args:
    cluster_center_eigvalues: List of lists of eigenvalues for each cluster,
      e.g. [[0.9, 0.1], [0.5, 0.1], [0.2, 0.2]], or None. If None,
      eigenvalues will be generated from uniform(-1,1) with respect to
      cluster_center_dist_lower_bound.
    cluster_center_dist_lower_bound: Desired distance lower bound between
      clusters. When generating cluster centers, try repeatedly until
      distance is greater than cluster_center_dist_lower_bound.
    hidden_state_dim: True hidden state dim.
    input_dim: The input dim.
    guessed_hidden_dim: Assumed hidden dim. If 0, use true hidden dim.
    num_clusters: True number of clusters.
    guessed_num_clusters: Desired number of clusters. If 0, use true number.
    min_seq_len: Min seq len in experiments.
    max_seq_len: Max seq len in experiments.
    num_sampled_seq_len: Number of sampled seq len values in between min and
      max seq len.
    num_repeat: Number of repeated experiments for each seq_len.
    num_systems: Number of dynamical system in each clustering experiments.
    cluster_radius: Expected distance of generated systems from cluster
      centers.
    input_mean: Scalar or 1D array of length hidden state dim.
    input_stddev: Scalar or 1D array of length hidden state dim.
    output_noise_stddev: Scalar.
    init_state_mean: Scalar or 1D array of length hidden state dim.
    init_state_stddev: Scalar or 1D array of length hidden state dim.
    generate_diagonalizable_only: Passed as the `diagonalizable` argument when
      generating cluster centers and the systems around them.
    random_seed: Random seed, integer.
    results_path: If set, the results accumulated so far are written to this
      path as CSV (overwriting the file) after every experiment.

  Returns:
    A pandas DataFrame with columns `method`, `seq_len`, `t_secs`,
    `failed_ratio`, and columns for clustering metrics such as
    `adj_mutual_info` and `v_measure`.
    The same method and seq_len will appear in num_repeat many rows.

  Raises:
    ValueError: If cluster_center_eigvalues is given but has fewer than two
      rows, or its shape is not (num_clusters, hidden_state_dim).
  """
  if cluster_center_eigvalues is not None:
    if len(cluster_center_eigvalues) <= 1:
      raise ValueError('Need at least two cluster centers.')
    cluster_center_eigvalues = np.array(cluster_center_eigvalues)
    if cluster_center_eigvalues.shape != (num_clusters, hidden_state_dim):
      raise ValueError(
          'Cluster center eig has shape %s, expected (%d, %d).' %
          (str(cluster_center_eigvalues.shape), num_clusters,
           hidden_state_dim))

  np.random.seed(random_seed)

  # Generator for output sequences.
  gen = lds.SequenceGenerator(
      input_mean=input_mean,
      input_stddev=input_stddev,
      output_noise_stddev=output_noise_stddev,
      init_state_mean=init_state_mean,
      init_state_stddev=init_state_stddev)
  seq_len_vals = [
      int(round(x))
      for x in np.linspace(min_seq_len, max_seq_len, num_sampled_seq_len)
  ]
  if guessed_hidden_dim == 0:
    guessed_hidden_dim = hidden_state_dim
  if guessed_num_clusters == 0:
    guessed_num_clusters = num_clusters

  results_dfs = []
  progress_bar = tqdm.tqdm(total=num_repeat * num_sampled_seq_len)
  # try/finally so the progress bar is closed even if an experiment raises.
  try:
    for i in range(num_repeat):
      logging.info('---Starting experiments in repeat run #%d---', i)
      if cluster_center_eigvalues is not None:
        cluster_centers = [
            lds.generate_linear_dynamical_system(
                hidden_state_dim, input_dim, eigvalues=eig_val)
            for eig_val in cluster_center_eigvalues
        ]
      else:
        cluster_centers = clustering.generate_cluster_centers(
            num_clusters,
            hidden_state_dim,
            input_dim,
            cluster_center_dist_lower_bound,
            diagonalizable=generate_diagonalizable_only)
      true_systems, true_cluster_ids = clustering.generate_lds_clusters(
          cluster_centers,
          num_systems,
          cluster_radius,
          diagonalizable=generate_diagonalizable_only)

      for seq_len in seq_len_vals:
        logging.info('Running experiment with seq_len = %d.', seq_len)
        seqs = [gen.generate_seq(s, seq_len=seq_len) for s in true_systems]

        # Create transform_fns. The PCA baseline is refit for every seq_len
        # on the flattened outputs of the freshly generated sequences.
        model_fns = create_model_fns(guessed_hidden_dim)
        pca = sklearn.decomposition.PCA(n_components=guessed_hidden_dim).fit(
            np.stack([s.outputs.flatten() for s in seqs], axis=0))
        model_fns['PCA'] = _create_pca_model_fn(pca)
        transform_fns = collections.OrderedDict()
        for k in model_fns:
          transform_fns[k] = _compose_model_fn(model_fns[k])

        # Get clustering results.
        results_df = clustering.get_results(
            seqs,
            guessed_num_clusters,
            true_cluster_ids,
            true_systems,
            transform_fns,
            include_tslearn=FLAGS.include_tslearn,
            include_slow_methods=FLAGS.include_tslearn_slow)
        results_df['seq_len'] = seq_len
        results_df['n_guessed_clusters'] = guessed_num_clusters
        results_df['n_true_clusters'] = num_clusters
        results_df['true_hidden_dim'] = hidden_state_dim
        results_df['guessed_hidden_dim'] = guessed_hidden_dim
        results_dfs.append(results_df)
        logging.info('Results:\n%s', str(results_df))

        if FLAGS.plot_clusters:
          # Only touch FLAGS.output_dir when a plot is actually requested.
          plot_filepath = os.path.join(
              FLAGS.output_dir,
              'cluster_visualization_run_%d_seq_len_%d.png' % (i, seq_len))
          clustering.visualize_clusters(seqs, true_systems, true_cluster_ids,
                                        transform_fns, plot_filepath)
        progress_bar.update(1)

        if results_path:
          # Checkpoint everything collected so far, so partial results
          # survive an interrupted run.
          with open(results_path, 'w+') as f:
            pd.concat(results_dfs).to_csv(f, index=False)
  finally:
    progress_bar.close()
  return pd.concat(results_dfs)