コード例 #1
0
ファイル: scan.py プロジェクト: JohnReid/STEME
def create_html_output(dataset_name, motifs, occurrences, by_motif, seq_infos,
                       options):
    """Create HTML output.
    """
    from jinja2 import Environment, PackageLoader
    env = Environment(loader=PackageLoader('stempy', 'templates'))
    template = env.get_template('scan-stats.html')

    # copy the static info
    static_dir = os.path.join(options.results_dir, 'static')
    html_copy_static(static_dir)

    # write the HTML
    filename = os.path.join(options.results_dir, 'scan-stats.html')
    logger.info('Writing STEME scan statistics as HTML to %s', filename)
    num_bases = sum(info.length for info in seq_infos)
    with open(filename, 'w') as f:
        variables = {
            'dataset_name': dataset_name,
            'num_sites': len(occurrences),
            'num_motifs': len(motifs),
            'num_seqs': len(seq_infos),
            'num_bases': num_bases,
            'options': options,
            'num_seq_clusters': num_seq_clusters(len(seq_infos)),
        }
        f.write(template.render(**variables))

    # create the figures
    if len(occurrences):
        with pylab_context_ioff():
            create_figures(motifs, occurrences, by_motif, seq_infos, options)
コード例 #2
0
def dpmeans(x, lambda_, progress_plots=False):
    """
    The DP-means algorithm by `Kulis et al.`_
    
    .. _Kulis et al.: http://arxiv.org/abs/1111.0352    
    
    
    
    :parameters:
    
    - :math:`x` : input data, a sequence of length :math:`N`
    - :math:`\lambda` : cluster penalty parameter
    - progress_plots : Scatter plot clusters at every iteration
    
    :returns: Cluster indicator variables
    

    Algorithm:
    
    #. Initialise:
      
      * Number of clusters :math:`K=1`
      * Global cluster mean :math:`\\mu_1 = \\frac{1}{n} \\sum_n x_n`
      * Cluster indicator variables :math:`z_n = 0 \\quad \\forall n`
      
    #. Repeat until convergence:
    
      * For each point :math:`n`:

        - Compute distance to each cluster :math:`d_{nk} = ||x_n - \\mu_k||^2`
        - If :math:`\\min d_{nk} > \lambda` then set :math:`K=K+1, z_n=K, \\mu_k=x_n`
        - Otherwise set :math:`z_n= \\arg\!\\min_k d_{nk}`

      * For each cluster :math:`k`, compute :math:`\\mu_k = \\frac{1}{|\{n: z_n = k\}|}\sum_{n: z_n = k} x_n`
    
    """
    if progress_plots:
        import pylab as P
        from cookbook.pylab_utils import pylab_context_ioff, \
            create_format_cycler, simple_marker_styles, simple_colours
        format_cycler = create_format_cycler(marker=simple_marker_styles, color=simple_colours)
    
    N = len(x)
    logging.info('Got %d data', N)
    lambda2 = lambda_ ** 2
    z = numpy.zeros(N, dtype=numpy.int) # initialise cluster indicators
    last_z = None
    for i in count(1):
    #for i in xrange(1,21):
        logging.info('Iteration %d: have %d cluster(s)', i, int(z.max() + 1))
        
        # calculate cluster means
        mu = [numpy.mean(x[z==k], axis=0) for k in xrange(int(z.max() + 1))]

        for n, xn in enumerate(x):
            d2 = numpy.array([((xn - muk)**2).sum() for muk in mu])
            closest_k = d2.argmin()
            if d2[closest_k] > lambda2:
                mu.append(xn)
            else:
                z[n] = closest_k
        
        # make clusters contiguous from 0
        cluster_map = dict((c, k) for k, c in enumerate(set(z)))
        if len(cluster_map) < int(z.max() + 1):
            logging.warning('Reducing cluster indices')
        for n, c in enumerate(z):
            z[n] = cluster_map[c]
        
        # check if we have converged by testing if no z have changed
        if None != last_z and (z == last_z).all():
            break
        last_z = z.copy()
        
        if progress_plots:
            with pylab_context_ioff():
                P.figure()
                plot_clusters(x, z, format_cycler)
                P.savefig('dpmeans-%04d.png' % i)
                P.close()
    
    num_clusters = len(set(z))
    logging.info('Have %d cluster(s)', num_clusters)
    return z