Exemple #1
0
def run_one(path):
    """Check NCDist against NCDist2017 on consecutive unit cells read from *path*.

    Unit cells are produced by ``generate_unit_cells_from_text(path)`` and
    converted to G6 vectors with ``SingleFrame.make_g6``.  Every pair of
    neighbouring G6 vectors is then used to assert that

    * ``NCDist(a, b)`` equals ``NCDist2017(a, b)``, and
    * ``NCDist2017`` gives the same value with its arguments swapped.

    :param path: passed unchanged to ``generate_unit_cells_from_text``.
    :raises AssertionError: if either equality fails for any pair.
    """
    cells = list(generate_unit_cells_from_text(path))
    g6 = [SingleFrame.make_g6(u) for u in cells]

    # for the purpose of this test, cycle through pairs of g6 vectors.
    # zip() over the list and its one-shifted copy replaces the Python-2-only
    # xrange() index loop and yields nothing for fewer than two cells.
    for a, b in zip(g6, g6[1:]):
        old = NCDist(a, b)
        new = NCDist2017(a, b)
        com = NCDist2017(b, a)
        assert old == new
        assert new == com
def run_one(path):
    """Check NCDist against NCDist2017 on consecutive unit cells read from *path*.

    Unit cells are produced by ``generate_unit_cells_from_text(path)`` and
    converted to G6 vectors with ``SingleFrame.make_g6``.  Every pair of
    neighbouring G6 vectors is then used to assert that

    * ``NCDist(a, b)`` equals ``NCDist2017(a, b)``, and
    * ``NCDist2017`` gives the same value with its arguments swapped.

    While ``NCDist2017`` runs, the OpenMP thread count is forced to 1 through
    ``omptbx`` and afterwards reset to ``OMP_NUM_THREADS`` (1 when unset).

    :param path: passed unchanged to ``generate_unit_cells_from_text``.
    :raises AssertionError: if either equality fails for any pair.
    """
    import os

    import omptbx

    cells = list(generate_unit_cells_from_text(path))
    g6 = [SingleFrame.make_g6(u) for u in cells]

    # workaround allows use of non-thread-safe NCDist, even if openMP is
    # enabled elsewhere in the Python program.  The value to restore is read
    # once from the environment instead of on every iteration.
    workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))

    # for the purpose of this test, cycle through pairs of g6 vectors
    for ix, (a, b) in enumerate(zip(g6, g6[1:])):
        old = NCDist(a, b)
        omptbx.omp_set_num_threads(1)
        try:
            new = NCDist2017(a, b)
            com = NCDist2017(b, a)
        finally:
            # Always restore the thread count, even if NCDist2017 raises, so a
            # failure here does not leave the whole process single-threaded.
            omptbx.omp_set_num_threads(workaround_nt)
        assert old == new, "Zeldin, AB2017"
        assert new == com, "Pair %d NCDist(a,b) %f != NCDist(b,a) %f" % (
            ix, new, com)
Exemple #3
0
    def ab_cluster(self,
                   threshold=10000,
                   method='distance',
                   linkage_method='single',
                   log=False,
                   ax=None,
                   write_file_lists=True,
                   schnell=False,
                   doplot=True,
                   labels='default'):
        """
        Hierarchical clustering using the unit cell dimensions.

        :param threshold: the threshold to use for pruning the tree into clusters.
        :param method: the criterion handed to scipy.cluster.hierarchy.fcluster when cutting the tree into flat clusters.
        :param linkage_method: which linkage method from scipy to use when creating the linkages (see scipy.cluster.hierarchy).
        :param log: if True, use a symmetric log scale on the y axis.
        :param ax: if a matplotlib axes object is provided, plot to this. Otherwise, create a new figure, save it as "<cname>_dendogram.pdf" and display it on screen.
        :param write_file_lists: if True, write out the files that make up each cluster with more than one member.
        :param schnell: if True, use simple Euclidean distance, otherwise, use Andrews-Bernstein distance from Andrews & Bernstein J Appl Cryst 47:346 (2014) on the Niggli cells.
        :param doplot: Boolean flag for if the plotting should be done at all.
                       Runs faster if switched off.
        :param labels: 'default' will not display any labels for more than 100 images, but will display file names for fewer. True/False force names/no labels; any other value is used as the name of an attribute read from every member.
        :return: A tuple ``(sub_clusters, ax)``. ``sub_clusters`` is sorted by ascending number of members and renamed ``cluster_1`` ... ``cluster_N`` in that order; ``ax`` is the axes that was drawn on (the ``ax`` argument unchanged when nothing was plotted). With fewer than two members nothing can be clustered and ``([], ax)`` is returned.

        .. note::
          Use 'schnell' option with caution, since it can cause strange behaviour
          around symmetry boundaries.
        """

        logging.info("Hierarchical clustering of unit cells")

        # A linkage needs at least one pairwise distance, i.e. two members.
        # Return early instead of letting scipy raise on the empty input.
        if len(self.members) < 2:
            logging.debug("No distances were calculated. Aborting clustering.")
            return [], ax

        import scipy.spatial.distance as dist
        import scipy.cluster.hierarchy as hcluster

        # 1. Create a numpy array of G6 cells
        g6_cells = np.array(
            [SingleFrame.make_g6(image.uc) for image in self.members])

        # 2. Do hierarchical clustering on the pairwise distances.
        if schnell:
            logging.info("Using Euclidean distance")
            metric = 'euclidean'
        else:
            logging.info(
                "Using Andrews-Bernstein distance from Andrews & Bernstein "
                "J Appl Cryst 47:346 (2014)")
            metric = NCDist
        pair_distances = dist.pdist(g6_cells, metric=metric)
        logging.info("Distances have been calculated")
        # pair_distances is already a condensed distance matrix, for which
        # linkage() ignores its metric argument, so none is passed.
        this_linkage = hcluster.linkage(pair_distances, method=linkage_method)
        cluster_ids = hcluster.fcluster(this_linkage,
                                        threshold,
                                        criterion=method)
        logging.debug("Clusters have been calculated")

        # 3. Create an array of sub-cluster objects from the clustering.
        #    Members are bucketed by flat-cluster id in a single pass rather
        #    than rescanning every member once per cluster.
        info_string = ('Made using ab_cluster with t={},'
                       ' {} method, and {} linkage').format(
                           threshold, method, linkage_method)
        members_by_id = {}
        for member, cluster_id in zip(self.members, cluster_ids):
            members_by_id.setdefault(int(cluster_id), []).append(member)
        sub_clusters = [
            self.make_sub_cluster(members_by_id.get(cluster_id, []),
                                  'cluster_{}'.format(cluster_id),
                                  info_string)
            for cluster_id in range(1, int(max(cluster_ids)) + 1)
        ]

        # Smallest cluster first; the sort is stable so ties keep id order.
        sub_clusters.sort(key=lambda x: len(x.members))
        # Rename to order by size
        for num, cluster in enumerate(sub_clusters):
            cluster.cname = 'cluster_{}'.format(num + 1)

        # 3.5 optionally write out the clusters to files.
        if write_file_lists:
            for cluster in sub_clusters:
                if len(cluster.members) > 1:
                    cluster.dump_file_list(
                        out_file_name="{}.lst".format(cluster.cname))

        if doplot:
            if labels is True:
                labels = [image.name for image in self.members]
            elif labels is False:
                labels = ['' for _ in self.members]
            elif labels == 'default':
                if len(self.members) > 100:
                    labels = ['' for _ in self.members]
                else:
                    labels = [image.name for image in self.members]
            else:
                labels = [getattr(v, labels, '') for v in self.members]

            # 4. Plot a dendrogram to the axes if no axis is passed, otherwise
            #    just return the axes object
            if ax is None:
                fig = plt.figure("Distance Dendogram")
                ax = fig.gca()
                direct_visualisation = True
            else:
                direct_visualisation = False

            hcluster.dendrogram(this_linkage,
                                labels=labels,
                                leaf_font_size=8,
                                leaf_rotation=90.0,
                                color_threshold=threshold,
                                ax=ax)

            if log:
                # The linear range (-1, 1) corresponds to linthresh=1.  The
                # keyword is ``linthresh`` in matplotlib >= 3.3 and
                # ``linthreshy`` for the y axis in older releases; the
                # x-axis keyword ``linthreshx`` used before does not apply.
                try:
                    ax.set_yscale("symlog", linthresh=1)
                except (TypeError, ValueError):
                    ax.set_yscale("symlog", linthreshy=1)
            else:
                ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])

            if direct_visualisation:
                fig.savefig("{}_dendogram.pdf".format(self.cname))
                plt.show()

        return sub_clusters, ax
Exemple #4
0
    def ab_cluster(
        self,
        threshold=10000,
        method="distance",
        linkage_method="single",
        log=False,
        ax=None,
        write_file_lists=True,
        schnell=False,
        doplot=True,
        labels="default",
    ):
        """
        Hierarchical clustering using the unit cell dimensions.

        :param threshold: the threshold to use for pruning the tree into clusters.
        :param method: the criterion handed to scipy.cluster.hierarchy.fcluster
                       when cutting the tree into flat clusters.
        :param linkage_method: which linkage method from scipy to use when creating
                               the linkages (see scipy.cluster.hierarchy).
        :param log: if True, use a symmetric log scale on the y axis.
        :param ax: if a matplotlib axes object is provided, plot to this.
                   Otherwise, create a new figure, save it as
                   "<cname>_dendogram.pdf" and display it on screen.
        :param write_file_lists: if True, write out the files that make up each
                                 cluster with more than one member.
        :param schnell: if True, use simple Euclidean distance, otherwise, use Andrews-Bernstein
                        distance from Andrews & Bernstein J Appl Cryst 47:346 (2014) on the Niggli cells.
        :param doplot: Boolean flag for if the plotting should be done at all.
                       Runs faster if switched off.
        :param labels: 'default' will not display any labels for more than 100 images, but will display
                       file names for fewer. True/False force names/no labels; any other value is
                       used as the name of an attribute read from every member.
        :return: A tuple ``(sub_clusters, dendrogram, ax)``. ``sub_clusters`` is
                 sorted by ascending number of members and renamed ``cluster_1``
                 ... ``cluster_N`` in that order; ``dendrogram`` is the value
                 returned by scipy.cluster.hierarchy.dendrogram; ``ax`` is the
                 axes that was drawn on (the ``ax`` argument unchanged when
                 nothing was plotted). With fewer than two members nothing can
                 be clustered and ``([], None, ax)`` is returned.

        .. note::
          Use 'schnell' option with caution, since it can cause strange behaviour
          around symmetry boundaries.
        """

        logger.info("Hierarchical clustering of unit cells")

        # A linkage needs at least one pairwise distance, i.e. two members.
        # The abort path returns the same three-element shape as the normal
        # path so callers can always unpack the result the same way.
        if len(self.members) < 2:
            logger.debug("No distances were calculated. Aborting clustering.")
            return [], None, ax

        import numpy as np

        from cctbx.uctbx.determine_unit_cell import NCDist
        from xfel.clustering.singleframe import SingleFrame

        import scipy.cluster.hierarchy as hcluster
        import scipy.spatial.distance as dist

        # 1. Create a numpy array of G6 cells
        g6_cells = np.array([SingleFrame.make_g6(image.uc) for image in self.members])

        # 2. Do hierarchical clustering on the pairwise distances.
        if schnell:
            logger.info("Using Euclidean distance")
            metric = "euclidean"
        else:
            logger.info(
                "Using Andrews-Bernstein distance from Andrews & Bernstein "
                "J Appl Cryst 47:346 (2014)"
            )
            metric = NCDist
        pair_distances = dist.pdist(g6_cells, metric=metric)
        logger.info("Distances have been calculated")
        this_linkage = hcluster.linkage(
            pair_distances, method=linkage_method, metric=metric
        )
        cluster_ids = hcluster.fcluster(this_linkage, threshold, criterion=method)
        logger.debug("Clusters have been calculated")

        # 3. Create an array of sub-cluster objects from the clustering.
        #    Members are bucketed by flat-cluster id in a single pass rather
        #    than rescanning every member once per cluster.
        info_string = f"Made using ab_cluster with t={threshold}, {method} method, and {linkage_method} linkage"
        members_by_id = {}
        for member, cluster_id in zip(self.members, cluster_ids):
            members_by_id.setdefault(int(cluster_id), []).append(member)
        sub_clusters = [
            self.make_sub_cluster(
                members_by_id.get(cluster_id, []),
                f"cluster_{cluster_id}",
                info_string,
            )
            for cluster_id in range(1, int(max(cluster_ids)) + 1)
        ]

        # Smallest cluster first; the sort is stable so ties keep id order.
        sub_clusters.sort(key=lambda x: len(x.members))
        # Rename to order by size
        for num, cluster in enumerate(sub_clusters):
            cluster.cname = f"cluster_{num + 1}"

        # 3.5 optionally write out the clusters to files.
        if write_file_lists:
            for cluster in sub_clusters:
                if len(cluster.members) > 1:
                    cluster.dump_file_list(out_file_name=f"{cluster.cname}.lst")

        # Labels are needed even without plotting because the dendrogram
        # structure is always computed below.
        if labels is True:
            labels = [image.name for image in self.members]
        elif labels is False:
            labels = ["" for _ in self.members]
        elif labels == "default":
            if len(self.members) > 100:
                labels = ["" for _ in self.members]
            else:
                labels = [image.name for image in self.members]
        else:
            labels = [getattr(v, labels, "") for v in self.members]

        if doplot:
            import matplotlib.pyplot as plt

            # 4. Plot a dendrogram to the axes if no axis is passed, otherwise
            #    just return the axes object
            if ax is None:
                fig = plt.figure("Distance Dendogram")
                ax = fig.gca()
                direct_visualisation = True
            else:
                direct_visualisation = False

        dendrogram = hcluster.dendrogram(
            this_linkage,
            labels=labels,
            p=200,
            truncate_mode="lastp",  # show only the last p merged clusters
            leaf_font_size=8,
            leaf_rotation=90.0,
            color_threshold=threshold,
            ax=ax,
            no_plot=not doplot,
        )

        if doplot:
            if log:
                # The linear range (-1, 1) corresponds to linthresh=1.  The
                # keyword is ``linthresh`` in matplotlib >= 3.3 and
                # ``linthreshy`` for the y axis in older releases; the
                # x-axis keyword ``linthreshx`` used before does not apply.
                try:
                    ax.set_yscale("symlog", linthresh=1)
                except (TypeError, ValueError):
                    ax.set_yscale("symlog", linthreshy=1)
            else:
                ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])

            if direct_visualisation:
                fig.savefig(f"{self.cname}_dendogram.pdf")
                plt.show()

        return sub_clusters, dendrogram, ax
Exemple #5
0
  def ab_cluster(self, threshold=10000, method='distance',
                 linkage_method='single', log=False,
                 ax=None, write_file_lists=True, schnell=False, doplot=True,
                 labels='default'):
    """
    Hierarchical clustering using the unit cell dimensions.

    :param threshold: the threshold to use for pruning the tree into clusters.
    :param method: the criterion handed to scipy.cluster.hierarchy.fcluster when cutting the tree into flat clusters.
    :param linkage_method: which linkage method from scipy to use when creating the linkages (see scipy.cluster.hierarchy).
    :param log: if True, use a symmetric log scale on the y axis.
    :param ax: if a matplotlib axes object is provided, plot to this. Otherwise, create a new figure, save it as "<cname>_dendogram.pdf" and display it on screen.
    :param write_file_lists: if True, write out the files that make up each cluster with more than one member.
    :param schnell: if True, use simple Euclidean distance, otherwise, use Andrews-Bernstein distance from Andrews & Bernstein J Appl Cryst 47:346 (2014) on the Niggli cells.
    :param doplot: Boolean flag for if the plotting should be done at all.
    Runs faster if switched off.
    :param labels: 'default' will not display any labels for more than 100 images, but will display file names for fewer. True/False force names/no labels; any other value is used as the name of an attribute read from every member.
    :return: A tuple ``(sub_clusters, ax)``. ``sub_clusters`` is sorted by ascending number of members and renamed ``cluster_1`` ... ``cluster_N`` in that order; ``ax`` is the axes that was drawn on (the ``ax`` argument unchanged when nothing was plotted). With fewer than two members nothing can be clustered and ``([], ax)`` is returned.

    .. note::
      Use 'schnell' option with caution, since it can cause strange behaviour
      around symmetry boundaries.
    """

    logging.info("Hierarchical clustering of unit cells")

    # A linkage needs at least one pairwise distance, i.e. two members.
    # Return early instead of letting scipy raise on the empty input.
    if len(self.members) < 2:
      logging.debug("No distances were calculated. Aborting clustering.")
      return [], ax

    import scipy.spatial.distance as dist
    import scipy.cluster.hierarchy as hcluster

    # 1. Create a numpy array of G6 cells
    g6_cells = np.array([SingleFrame.make_g6(image.uc)
                         for image in self.members])

    # 2. Do hierarchical clustering on the pairwise distances.
    if schnell:
      logging.info("Using Euclidean distance")
      metric = 'euclidean'
    else:
      logging.info("Using Andrews-Bernstein distance from Andrews & Bernstein "
                   "J Appl Cryst 47:346 (2014)")
      metric = NCDist
    pair_distances = dist.pdist(g6_cells, metric=metric)
    logging.info("Distances have been calculated")
    # pair_distances is already a condensed distance matrix, for which
    # linkage() ignores its metric argument, so none is passed.
    this_linkage = hcluster.linkage(pair_distances, method=linkage_method)
    cluster_ids = hcluster.fcluster(this_linkage,
                                    threshold,
                                    criterion=method)
    logging.debug("Clusters have been calculated")

    # 3. Create an array of sub-cluster objects from the clustering.
    #    Members are bucketed by flat-cluster id in a single pass rather
    #    than rescanning every member once per cluster.
    info_string = ('Made using ab_cluster with t={},'
                   ' {} method, and {} linkage').format(threshold,
                                                        method,
                                                        linkage_method)
    members_by_id = {}
    for member, cluster_id in zip(self.members, cluster_ids):
      members_by_id.setdefault(int(cluster_id), []).append(member)
    sub_clusters = [
      self.make_sub_cluster(members_by_id.get(cluster_id, []),
                            'cluster_{}'.format(cluster_id),
                            info_string)
      for cluster_id in range(1, int(max(cluster_ids)) + 1)
    ]

    # Smallest cluster first; the sort is stable so ties keep id order.
    sub_clusters.sort(key=lambda x: len(x.members))
    # Rename to order by size
    for num, cluster in enumerate(sub_clusters):
      cluster.cname = 'cluster_{}'.format(num + 1)

    # 3.5 optionally write out the clusters to files.
    if write_file_lists:
      for cluster in sub_clusters:
        if len(cluster.members) > 1:
          cluster.dump_file_list(out_file_name="{}.lst".format(cluster.cname))

    if doplot:
      if labels is True:
        labels = [image.name for image in self.members]
      elif labels is False:
        labels = ['' for _ in self.members]
      elif labels == 'default':
        if len(self.members) > 100:
          labels = ['' for _ in self.members]
        else:
          labels = [image.name for image in self.members]
      else:
        labels = [getattr(v, labels, '') for v in self.members]

      # 4. Plot a dendrogram to the axes if no axis is passed, otherwise just
      #    return the axes object
      if ax is None:
        fig = plt.figure("Distance Dendogram")
        ax = fig.gca()
        direct_visualisation = True
      else:
        direct_visualisation = False

      hcluster.dendrogram(this_linkage,
                          labels=labels,
                          leaf_font_size=8, leaf_rotation=90.0,
                          color_threshold=threshold, ax=ax)

      if log:
        # The linear range (-1, 1) corresponds to linthresh=1.  The keyword
        # is ``linthresh`` in matplotlib >= 3.3 and ``linthreshy`` for the
        # y axis in older releases; the x-axis keyword ``linthreshx`` used
        # before does not apply.
        try:
          ax.set_yscale("symlog", linthresh=1)
        except (TypeError, ValueError):
          ax.set_yscale("symlog", linthreshy=1)
      else:
        ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])

      if direct_visualisation:
        fig.savefig("{}_dendogram.pdf".format(self.cname))
        plt.show()

    return sub_clusters, ax