def plot_clusters(x, z, format_cycler=None):
    """
    Scatter plot the first two columns of the data, one format per cluster.

    :parameters:

      - x : data, indexed with a boolean mask and by column
        (presumably a 2-D numpy array with at least two columns)
      - z : cluster indicator for each row of x
      - format_cycler : callable mapping a cluster's position to the keyword
        arguments handed to ``scatter``; when not given (or falsy) one is
        built from the simple marker styles and colours
    """
    import pylab as P
    from cookbook.pylab_utils import create_format_cycler, \
        simple_marker_styles, simple_colours

    # Fall back to a marker/colour cycler when the caller supplied none
    format_cycler = format_cycler or create_format_cycler(
        marker=simple_marker_styles, color=simple_colours)

    # One scatter call per distinct cluster label
    for position, label in enumerate(set(z)):
        members = x[z == label]
        scatter_kwargs = format_cycler(position)
        P.scatter(members[:, 0], members[:, 1], **scatter_kwargs)
def dpmeans(x, lambda_, progress_plots=False):
    r"""
    The DP-means algorithm by `Kulis et al.`_

    .. _Kulis et al.: http://arxiv.org/abs/1111.0352

    :parameters:

      - :math:`x` : input data, a sequence of length :math:`N`
        (converted with ``numpy.asarray``)
      - :math:`\lambda` : cluster penalty parameter. It is used as a
        distance threshold: a point opens a new cluster when its squared
        distance to the closest mean exceeds :math:`\lambda^2`
      - progress_plots : Scatter plot clusters at every iteration, saved
        as ``dpmeans-NNNN.png``

    :returns: Cluster indicator variables, an integer array of length
        :math:`N` with contiguous labels starting at 0. An empty input
        gives an empty array.

    Algorithm:

    #. Initialise:

       * Number of clusters :math:`K=1`
       * Global cluster mean :math:`\mu_1 = \frac{1}{n} \sum_n x_n`
       * Cluster indicator variables :math:`z_n = 0 \quad \forall n`

    #. Repeat until convergence:

       * For each point :math:`n`:

         - Compute distance to each cluster
           :math:`d_{nk} = ||x_n - \mu_k||^2`
         - If :math:`\min d_{nk} > \lambda^2` then set
           :math:`K=K+1, z_n=K, \mu_k=x_n`
         - Otherwise set :math:`z_n= \arg\!\min_k d_{nk}`

       * For each cluster :math:`k`, compute
         :math:`\mu_k = \frac{1}{|\{n: z_n = k\}|}\sum_{n: z_n = k} x_n`
    """
    if progress_plots:
        import pylab as P
        from cookbook.pylab_utils import pylab_context_ioff, \
            create_format_cycler, simple_marker_styles, simple_colours
        format_cycler = create_format_cycler(
            marker=simple_marker_styles, color=simple_colours)

    # Boolean-mask indexing below needs an array, not a plain sequence
    x = numpy.asarray(x)
    N = len(x)
    logging.info('Got %d data', N)
    # initialise cluster indicators (builtin int: numpy.int no longer exists)
    z = numpy.zeros(N, dtype=int)
    if N == 0:
        # Nothing to cluster; z.max() below would fail on an empty array
        return z

    lambda2 = lambda_ ** 2
    last_z = None
    for i in count(1):
        num_clusters = int(z.max()) + 1
        logging.info('Iteration %d: have %d cluster(s)', i, num_clusters)

        # calculate cluster means
        mu = [numpy.mean(x[z == k], axis=0) for k in range(num_clusters)]

        for n, xn in enumerate(x):
            d2 = numpy.array([((xn - muk) ** 2).sum() for muk in mu])
            closest_k = d2.argmin()
            if d2[closest_k] > lambda2:
                # Too far from every mean: open a new cluster centred on
                # this point and assign the point to it
                mu.append(xn)
                z[n] = len(mu) - 1
            else:
                z[n] = closest_k

        # make clusters contiguous from 0; labels are sorted so each old
        # label maps to its rank among the labels still in use
        labels = numpy.unique(z)
        if len(labels) < int(z.max()) + 1:
            logging.warning('Reducing cluster indices')
            z = numpy.searchsorted(labels, z)

        # check if we have converged by testing if no z have changed
        if last_z is not None and (z == last_z).all():
            break
        last_z = z.copy()

        if progress_plots:
            with pylab_context_ioff():
                P.figure()
                plot_clusters(x, z, format_cycler)
                P.savefig('dpmeans-%04d.png' % i)
                P.close()

    num_clusters = len(labels)
    logging.info('Have %d cluster(s)', num_clusters)
    return z
def create_figures(motifs, occs, by_motif, seq_infos, options):
    """Create the scan statistics figures.

    Makes sure the ``scan-stats`` directory exists under
    ``options.results_dir`` and then draws each figure in turn, handing
    it to ``savefig`` under a fixed name before closing it.

    :parameters:

      - motifs : list of motifs; its length picks the legend font size and
        it supplies the legend labels
      - occs : passed through to the site positions plot
      - by_motif : passed through to the per-motif plots; its length sets
        the height of the occurrences-by-motif figure
      - seq_infos : passed through to the sequence based plots; its length
        is the number of sequences used for the best Z calculation
      - options : must provide ``results_dir``; also passed to ``savefig``
    """
    from stempy import ensure_dir_exists

    stats_dir = os.path.join(options.results_dir, 'scan-stats')
    ensure_dir_exists(stats_dir)

    # Size of figlegend: the more motifs, the smaller the font
    num_motifs = len(motifs)
    legend_font_size = 12
    for threshold, font_size in ((30, 6), (16, 8), (10, 10)):
        if num_motifs > threshold:
            legend_font_size = font_size
            break
    legend_props = {'size': legend_font_size}

    # Format cycler for line plots
    line_styles = ['--', '-.', '-', ':']
    line_colours = ("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
                    "#D55E00", "#CC79A7")
    line_formats = create_format_cycler(linestyle=line_styles, c=line_colours)

    standard_size = (6, 4)
    legend_size = (4.25, 4)

    # Scan scores
    pylab.figure(figsize=standard_size)
    score_lines = plot_scores_per_motif(motifs, by_motif, line_formats)
    savefig('scan-scores', options)
    pylab.close()

    # Scan legend
    pylab.figure(figsize=legend_size)
    pylab.figlegend(score_lines, motifs, 'center', prop=legend_props)
    savefig('scan-legend', options)
    pylab.close()

    # Best Z for each motif/sequence combination
    pylab.figure(figsize=standard_size)
    num_seqs = len(seq_infos)
    best_Z = calculate_motif_best_Z_per_sequence(motifs, by_motif, num_seqs)
    plot_best_Z(motifs, best_Z)
    savefig('scan-best-Z', options)
    pylab.close()

    # Scan motif cooccurrences
    pylab.figure(figsize=standard_size)
    plot_collinearity(motifs, best_Z)
    savefig('scan-collinearity', options)
    pylab.close()

    # Scan positions
    pylab.figure(figsize=standard_size)
    position_lines = plot_site_positions(
        motifs, occs, by_motif, seq_infos, line_formats)
    savefig('scan-positions', options)
    pylab.close()

    # Scan legend with all
    pylab.figure(figsize=legend_size)
    labels_with_all = ['ALL MOTIFS'] + motifs
    pylab.figlegend(
        position_lines, labels_with_all, 'center', prop=legend_props)
    savefig('scan-legend-with-all', options)
    pylab.close()

    # Sequence coverage
    pylab.figure(figsize=standard_size)
    plot_seq_coverage(best_Z, line_formats)
    savefig('scan-seq-coverage', options)
    pylab.close()

    # Scan sequences (the returned lines are not used: no marker legend
    # is drawn)
    pylab.figure(figsize=standard_size)
    plot_seq_distribution(motifs, by_motif, seq_infos, line_formats)
    savefig('scan-sequences', options)
    pylab.close()

    # Scan lengths
    pylab.figure(figsize=standard_size)
    plot_seq_lengths(seq_infos)
    savefig('scan-lengths', options)
    pylab.close()

    # Scan occurrences by motif: a quarter of a unit of height per motif
    occs_height = len(by_motif) / 4.
    pylab.figure(figsize=(6, occs_height))
    pylab.subplots_adjust(left=.3, bottom=.1, right=.96, top=.98)
    plot_occs_by_motif(by_motif)
    savefig('scan-occs-by-motif', options)
    pylab.close()