def get_results_bih_dataset(seqs, labels, hdim, num_clusters):
    """Clusters the given ECG sequences and returns the results dataframe.

    Each label is mapped to an integer cluster id, namely its index in the
    vocabulary returned by `np.unique(labels)`. A PCA baseline is fitted on
    the padded sequence matrix (after `_drop_nan_rows`) and registered under
    the key 'PCA' next to the models from `create_model_fns`.

    Args:
        seqs: Sequences to cluster; each must expose a `seq_len` attribute.
        labels: One label per sequence, used as ground truth.
        hdim: Passed to `create_model_fns` and used as the number of PCA
            components.
        num_clusters: Number of clusters passed to `clustering.get_results`.

    Returns:
        The object returned by `clustering.get_results`.
    """
    vocab, counts = np.unique(labels, return_counts=True)
    counts_msg = ' '.join([str(vocab), str(counts)])
    logging.info('Counts of labels in current run: %s', counts_msg)

    # Ground-truth cluster id of a label is its position in the vocabulary.
    id_by_label = dict(zip(vocab, range(len(vocab))))
    cluster_ids = []
    for label in labels:
        cluster_ids.append(id_by_label[label])

    model_fns = create_model_fns(hdim)
    padded_matrix = clustering.pad_seqs_to_matrix(seqs)
    longest = np.max([seq.seq_len for seq in seqs])
    pca_model = sklearn.decomposition.PCA(n_components=hdim)
    fitted_pca = pca_model.fit(_drop_nan_rows(padded_matrix))

    def _pca_model_fn(seq):
        # Pad a single sequence to the longest length seen during fitting, so
        # it has the same width as the matrix the PCA was fitted on.
        single_row = clustering.pad_seqs_to_matrix([seq], longest)
        return fitted_pca.transform(_replace_nan_with_0(single_row)).flatten()

    model_fns['PCA'] = _pca_model_fn

    # Run every registered model through the clustering benchmark.
    results_df = clustering.get_results(
        seqs,
        num_clusters,
        cluster_ids,
        None,
        model_fns,
        include_tslearn=FLAGS.include_tslearn,
        include_slow_methods=FLAGS.include_tslearn_slow)
    logging.info(results_df)

    if not FLAGS.plot_clusters:
        return results_df

    figure_path = os.path.join(FLAGS.output_dir, 'visualization.png')
    clustering.visualize_clusters(seqs, None, labels, model_fns, figure_path)
    return results_df
def get_results_bih_dataset(hdim,
                            num_clusters,
                            pos_samples=25,
                            neg_samples=25):
    """Loads ECG sequences, clusters them and returns the results.

    Sequences and labels come from `parse_examples_to_seq`. A PCA baseline
    fitted on the padded sequence matrix is registered under the key 'PCA'
    next to the models from `create_model_fns`. The results are printed and
    a cluster visualization is written to 'visualization.png' inside
    `FLAGS.output_dir`.

    Args:
        hdim: Passed to `parse_examples_to_seq` and `create_model_fns`, and
            used as the number of PCA components.
        num_clusters: Number of clusters passed to `clustering.get_results`.
        pos_samples: Forwarded to `parse_examples_to_seq`.
        neg_samples: Forwarded to `parse_examples_to_seq`.

    Returns:
        The object returned by `clustering.get_results`.
    """
    seqs, labels = parse_examples_to_seq(hdim, pos_samples, neg_samples)
    model_fns = create_model_fns(hdim)

    padded_matrix = clustering.pad_seqs_to_matrix(seqs)
    longest = np.max([seq.seq_len for seq in seqs])
    pca_model = sklearn.decomposition.PCA(n_components=hdim)
    fitted_pca = pca_model.fit(padded_matrix)

    def _pca_model_fn(seq):
        # Pad a single sequence to the longest length seen during fitting, so
        # it has the same width as the matrix the PCA was fitted on.
        single_row = clustering.pad_seqs_to_matrix([seq], longest)
        return fitted_pca.transform(single_row).flatten()

    model_fns['PCA'] = _pca_model_fn

    # Run every registered model through the clustering benchmark.
    results_df = clustering.get_results(
        seqs,
        num_clusters,
        labels,
        None,
        model_fns,
        include_tslearn=FLAGS.include_tslearn)
    print(results_df)

    figure_path = os.path.join(FLAGS.output_dir, 'visualization.png')
    clustering.visualize_clusters(seqs, None, labels, model_fns, figure_path)
    return results_df
def get_results(cluster_center_eigvalues,
                cluster_center_dist_lower_bound,
                hidden_state_dim,
                input_dim,
                guessed_hidden_dim,
                num_clusters,
                guessed_num_clusters,
                min_seq_len,
                max_seq_len,
                num_sampled_seq_len,
                num_repeat,
                num_systems,
                cluster_radius,
                input_mean,
                input_stddev,
                output_noise_stddev,
                init_state_mean=0.0,
                init_state_stddev=0.0,
                generate_diagonalizable_only=False,
                random_seed=0,
                results_path=None):
    """Get results for varying sequence lengths.

    For each of `num_repeat` runs, cluster centers and systems are generated
    once, and then one clustering experiment is run per sampled sequence
    length on sequences generated from those systems.

    Args:
        cluster_center_eigvalues: List of lists of eigenvalues for each
            cluster, e.g. [[0.9, 0.1], [0.5, 0.1], [0.2, 0.2]], or None. If
            None, cluster centers are generated by
            `clustering.generate_cluster_centers` with respect to
            cluster_center_dist_lower_bound.
        cluster_center_dist_lower_bound: Desired distance lower bound between
            clusters. Only used when cluster_center_eigvalues is None.
        hidden_state_dim: True hidden state dim.
        input_dim: The input dim.
        guessed_hidden_dim: Assumed hidden dim. If 0, use true hidden dim.
        num_clusters: True number of clusters.
        guessed_num_clusters: Desired number of clusters. If 0, use true
            number.
        min_seq_len: Min seq len in experiments.
        max_seq_len: Max seq len in experiments.
        num_sampled_seq_len: Number of sampled seq len values, evenly spaced
            between min and max seq len (inclusive) and rounded to integers.
        num_repeat: Number of repeated experiments for each seq_len.
        num_systems: Number of dynamical systems in each clustering
            experiment.
        cluster_radius: Expected distance of generated systems from cluster
            centers.
        input_mean: Scalar or 1D array of length hidden state dim.
        input_stddev: Scalar or 1D array of length hidden state dim.
        output_noise_stddev: Scalar.
        init_state_mean: Scalar or 1D array of length hidden state dim.
        init_state_stddev: Scalar or 1D array of length hidden state dim.
        generate_diagonalizable_only: Forwarded as `diagonalizable` to
            `clustering.generate_cluster_centers` and
            `clustering.generate_lds_clusters`.
        random_seed: Random seed, integer. Used to seed `np.random`.
        results_path: Optional file path. If set, all results accumulated so
            far are written there as CSV after every repeat run, overwriting
            the previous contents.

    Returns:
        A pandas DataFrame concatenating the results of
        `clustering.get_results` for every (repeat, seq_len) pair, with the
        added columns `seq_len`, `n_guessed_clusters`, `n_true_clusters`,
        `true_hidden_dim` and `guessed_hidden_dim`. The same method and
        seq_len will appear in num_repeat many rows.

    Raises:
        ValueError: If cluster_center_eigvalues is given but has fewer than
            two entries, or its shape is not (num_clusters, hidden_state_dim).
    """
    if cluster_center_eigvalues is not None:
        if len(cluster_center_eigvalues) <= 1:
            raise ValueError('Need at least two cluster centers.')
        cluster_center_eigvalues = np.array(cluster_center_eigvalues)
        if cluster_center_eigvalues.shape != (num_clusters, hidden_state_dim):
            raise ValueError(
                'Cluter center eig has shape %s, expected (%d, %d).' %
                (str(cluster_center_eigvalues.shape), num_clusters,
                 hidden_state_dim))
    np.random.seed(random_seed)
    # Generator for output sequences.
    gen = lds.SequenceGenerator(input_mean=input_mean,
                                input_stddev=input_stddev,
                                output_noise_stddev=output_noise_stddev,
                                init_state_mean=init_state_mean,
                                init_state_stddev=init_state_stddev)
    seq_len_vals = [
        int(round(x))
        for x in np.linspace(min_seq_len, max_seq_len, num_sampled_seq_len)
    ]
    # A value of 0 means "use the true value".
    if guessed_hidden_dim == 0:
        guessed_hidden_dim = hidden_state_dim
    if guessed_num_clusters == 0:
        guessed_num_clusters = num_clusters
    results_dfs = []
    progress_bar = tqdm.tqdm(total=num_repeat * num_sampled_seq_len)
    # try/finally so the progress bar is closed even if an experiment raises.
    try:
        for i in range(num_repeat):
            logging.info('---Starting experiments in repeat run #%d---', i)
            if cluster_center_eigvalues is not None:
                cluster_centers = [
                    lds.generate_linear_dynamical_system(hidden_state_dim,
                                                         input_dim,
                                                         eigvalues=eig_val)
                    for eig_val in cluster_center_eigvalues
                ]
            else:
                cluster_centers = clustering.generate_cluster_centers(
                    num_clusters,
                    hidden_state_dim,
                    input_dim,
                    cluster_center_dist_lower_bound,
                    diagonalizable=generate_diagonalizable_only)
            true_systems, true_cluster_ids = clustering.generate_lds_clusters(
                cluster_centers,
                num_systems,
                cluster_radius,
                diagonalizable=generate_diagonalizable_only)
            for seq_len in seq_len_vals:
                logging.info('Running experiment with seq_len = %d.', seq_len)
                seqs = [
                    gen.generate_seq(s, seq_len=seq_len) for s in true_systems
                ]
                # Create transform_fns.
                model_fns = create_model_fns(guessed_hidden_dim)
                pca = sklearn.decomposition.PCA(
                    n_components=guessed_hidden_dim).fit(
                        np.stack([s.outputs.flatten() for s in seqs], axis=0))
                model_fns['PCA'] = _create_pca_model_fn(pca)
                transform_fns = collections.OrderedDict()
                for k in model_fns:
                    transform_fns[k] = _compose_model_fn(model_fns[k])
                # Get clustering results.
                results_df = clustering.get_results(
                    seqs,
                    guessed_num_clusters,
                    true_cluster_ids,
                    true_systems,
                    transform_fns,
                    FLAGS.include_tslearn,
                    include_slow_methods=FLAGS.include_tslearn_slow)
                results_df['seq_len'] = seq_len
                results_df['n_guessed_clusters'] = guessed_num_clusters
                results_df['n_true_clusters'] = num_clusters
                results_df['true_hidden_dim'] = hidden_state_dim
                results_df['guessed_hidden_dim'] = guessed_hidden_dim
                results_dfs.append(results_df)
                logging.info('Results:\n%s', str(results_df))
                if FLAGS.plot_clusters:
                    # Only build the path when plotting, so an unset
                    # output_dir does not break runs that do not plot.
                    plot_filepath = os.path.join(
                        FLAGS.output_dir,
                        'cluster_visualization_run_%d_seq_len_%d.png' %
                        (i, seq_len))
                    clustering.visualize_clusters(seqs, true_systems,
                                                  true_cluster_ids,
                                                  transform_fns, plot_filepath)
                progress_bar.update(1)
            if results_path:
                # Checkpoint: rewrite the file with everything collected so
                # far, so partial results survive a later failure.
                with open(results_path, 'w+') as f:
                    pd.concat(results_dfs).to_csv(f, index=False)
    finally:
        progress_bar.close()
    return pd.concat(results_dfs)