Exemple #1
0
def cluster_events(
    events,
    cutoff=10,
    linkage='average',
):
    """Cluster events into sites using the centroids of their clusters.

    The centroid of every event (`event.cluster.centroid`) is passed to
    scipy's `fclusterdata`, using euclidean distances, the 'distance'
    criterion with threshold `cutoff`, and the linkage method named by
    `linkage`. One `Site` is then built for each group yielded by
    `generate_group_idxs`, using the yielded key as the site id
    (presumably the scipy cluster label -- verify against
    `generate_group_idxs`).

    A single event is not clustered: it is wrapped in one site with id 1.

    Returns a `SiteList` holding the sites, each after `apply_parentage()`.
    """
    # Hierarchical clustering needs at least two observations
    if len(events) == 1:
        lone_site = Site(events=events, id=1)
        return SiteList([lone_site.apply_parentage()])

    points = [event.cluster.centroid for event in events]
    labels = list(
        scipy.cluster.hierarchy.fclusterdata(
            X=points,
            t=cutoff,
            criterion='distance',
            metric='euclidean',
            method=linkage,
        ))

    grouped_sites = []
    for site_id, member_idxs in generate_group_idxs(labels):
        # Site ids are expected to be strictly positive
        assert site_id > 0
        members = [events[idx] for idx in member_idxs]
        grouped_sites.append(Site(members, id=site_id).apply_parentage())

    return SiteList(grouped_sites)
Exemple #2
0
    def cluster_high_z_values(self, z_map_data, point_mask_idx):
        """Cluster the masked z-map points that pass the contour level.

        The values of `z_map_data` at `point_mask_idx` are compared with
        `self.params.contour_level`; when `self.params.negative_values` is
        set, values at or below the negated contour level are selected as
        well. The selected points are clustered with single-linkage
        hierarchical clustering on euclidean distance, using
        `self.grid_clustering_cutoff` as the distance threshold.

        Returns a tuple `(num_clusters, clusters)` where `clusters` is a list
        of `(grid_points, values)` pairs:
          * `(0, [])` when no point is selected;
          * `(1, [...])` with a single pair when exactly one point is selected;
          * `(-1, [...])` with all points in a single, unclustered pair when
            more than 10000 points are selected (too many to cluster);
          * otherwise the largest cluster id and one pair per cluster.
        """

        # Pull out the map values at the masked positions
        mask_idx = flex.size_t(point_mask_idx)
        mask_val = z_map_data.select(mask_idx)

        # Positions (within the mask) whose values pass the threshold
        if self.params.negative_values:
            pos_sel = (mask_val >= self.params.contour_level).iselection()
            neg_sel = (mask_val <=
                       -1.0 * self.params.contour_level).iselection()
            chosen = pos_sel.concatenate(neg_sel)
        else:
            chosen = (mask_val >= self.params.contour_level).iselection()

        # Values, map indices and grid points of the selected positions
        sel_val = mask_val.select(chosen)
        sel_idx = mask_idx.select(chosen)
        sel_gps = flex.vec3_double([
            idx_to_grid(gp_idx, grid_size=z_map_data.all())
            for gp_idx in sel_idx
        ])
        n_points = len(sel_val)

        # Nothing passed the threshold
        if n_points == 0:
            return 0, []
        # A single point is trivially one cluster
        if n_points == 1:
            return 1, [(sel_gps, sel_val)]
        # Too many points to cluster - flag with -1 and return them as one
        if n_points > 10000:
            return -1, [(sel_gps, sel_val)]

        self.log('> Clustering {!s} Points.'.format(n_points))
        started = time.time()
        cluster_ids = list(
            scipy.cluster.hierarchy.fclusterdata(
                X=sel_gps,
                t=self.grid_clustering_cutoff,
                criterion='distance',
                metric='euclidean',
                method='single'))
        finished = time.time()
        self.log('> Clustering > Time Taken: {!s} seconds'.format(
            int(finished - started)))

        # The largest id is taken as the number of clusters
        num_clusters = max(cluster_ids)
        # Collect the points and values belonging to each cluster id
        z_clusters = []
        for _, member_idxs in generate_group_idxs(cluster_ids):
            member_sel = flex.size_t(member_idxs)
            z_clusters.append(
                (sel_gps.select(member_sel), sel_val.select(member_sel)))
        assert num_clusters == len(z_clusters)
        return num_clusters, z_clusters
Exemple #3
0
def group_clusters(cluster_ids, num_clusters, above_gps, above_val):
    """Split points and values into one `(points, values)` pair per cluster.

    `cluster_ids` is grouped with `generate_group_idxs`; the indices of each
    group are used to select the matching entries of `above_gps` and
    `above_val`. The number of groups is asserted to equal `num_clusters`.

    Returns the list of `(points, values)` pairs.
    """
    # Lazily convert each group's indices so that conversion and selection
    # stay interleaved group by group
    selections = (flex.size_t(member_idxs)
                  for _, member_idxs in generate_group_idxs(cluster_ids))
    z_clusters = [(above_gps.select(sel), above_val.select(sel))
                  for sel in selections]
    assert num_clusters == len(z_clusters)

    return z_clusters
Exemple #4
0
    def by_unit_cell(cls, crystals, method='lcv', cutoff=0.5):
        """Cluster crystals by unit cell and return one `cls` instance per cluster.

        `method` selects the pairwise unit-cell comparison; only 'lcv'
        (`pairwise_lcv`) is recognised. The output of that comparison is
        clustered with scipy single-linkage clustering and cut at `cutoff`
        using the 'distance' criterion.

        A single crystal is returned as one group without any clustering.
        """
        if len(crystals) == 1:
            return [cls(crystals)]
        assert method in ['lcv'], 'method not recognised'

        # An earlier implementation based on
        # libtbx.cluster.HierarchicalClustering was replaced by the scipy
        # implementation below.
        if method == 'lcv':
            link_func = pairwise_lcv
        unit_cells = [xtal.unit_cell for xtal in crystals]
        distances = link_func(unit_cells=unit_cells)
        # NOTE(review): scipy's linkage treats a 1-D input as a condensed
        # distance matrix and a 2-D input as observation vectors -- confirm
        # which form pairwise_lcv returns.
        tree = scipy.cluster.hierarchy.linkage(
            distances, method='single', metric='euclidean')
        labels = scipy.cluster.hierarchy.fcluster(
            tree, t=cutoff, criterion='distance')

        groups = []
        for _, member_idxs in generate_group_idxs(labels):
            groups.append(cls([crystals[idx] for idx in member_idxs]))
        return groups
Exemple #5
0
    def group_clusters(self, z_clusters, separation_cutoff=5):
        """Join clusters whose closest grid points are nearer than `separation_cutoff`.

        `z_clusters` is a sequence of `(grid_points, values)` pairs.
        `separation_cutoff` is divided by `self.grid_spacing` to obtain the
        cutoff in grid units. Two clusters are connected when the minimum
        distance between any of their grid points is below that cutoff;
        connected clusters are merged via `find_connected_groups`.

        Returns `(number_of_groups, groups)` where `groups` is a list of
        merged `(grid_points, values)` pairs. An empty input returns
        `(0, [])` and a single cluster is returned unchanged as
        `(1, z_clusters)`.
        """

        # Nothing to group
        if len(z_clusters) == 0:
            return 0, []
        if len(z_clusters) == 1:
            return 1, z_clusters

        self.log('----------------------------------->>>')
        self.log('Grouping Nearby Clusters')

        # Minimum distance between grid points to be joined (squared).
        # float() guards against integer division under Python 2.
        grid_cutoff_sq = (float(separation_cutoff) / self.grid_spacing)**2

        # Record which clusters are to be joined
        n_clusters = len(z_clusters)
        connect_array = numpy.zeros((n_clusters, n_clusters), dtype=int)
        for i_clust_1, (c_gps_1, _) in enumerate(z_clusters):
            # A cluster is always connected to itself
            connect_array[i_clust_1, i_clust_1] = 1
            # The separation is symmetric, so each pair is evaluated once
            for i_clust_2 in range(i_clust_1 + 1, n_clusters):
                c_gps_2 = z_clusters[i_clust_2][0]
                # Minimum (squared) separation of the grid points
                min_dist_sq = min(
                    min((c_gps_2 - gp).dot()) for gp in c_gps_1)
                # Check to see if they should be joined
                if min_dist_sq < grid_cutoff_sq:
                    connect_array[i_clust_1, i_clust_2] = 1
                    connect_array[i_clust_2, i_clust_1] = 1

        # Cluster the connection array
        cluster_groupings = find_connected_groups(
            connection_matrix=connect_array)

        # Concatenate smaller clusters into larger clusters
        grouped_clusters = []
        for g_id, g_idxs in generate_group_idxs(cluster_groupings):
            g_gps = []
            g_val = []
            for i in g_idxs:
                g_gps.extend(z_clusters[i][0])
                g_val.extend(z_clusters[i][1])
            grouped_clusters.append(
                (flex.vec3_double(g_gps), flex.double(g_val)))

        assert len(grouped_clusters) == max(cluster_groupings)

        self.log('Grouped {!s} Clusters together to form {!s} Clusters'.format(
            len(z_clusters), len(grouped_clusters)))
        return len(grouped_clusters), grouped_clusters
Exemple #6
0
    def filter_z_clusters_3(self, z_clusters, dataset, max_contact_dist=8):
        """Find and remove symmetry equivalent clusters.

        `z_clusters` is a sequence of `(grid_points, values)` pairs. The grid
        points of each cluster are converted with `self.grid.grid2cart` and
        `dataset.model.alignment.ref2nat`, then fractionalised with the
        dataset's unit cell. Clusters are grouped by
        `find_symmetry_equivalent_groups`, and from each group the cluster
        with the most protein contacts per point is kept.

        A protein atom counts as a contact of a cluster when its closest
        cluster point is within `max_contact_dist` of it.

        Returns `(number_of_clusters, clusters)` for the clusters kept. An
        empty input returns `(0, [])` and a single cluster is returned
        unchanged as `(1, z_clusters)`.

        Raises `Exception` when the best cluster of a group has no contacts.
        """

        # Nothing to filter
        if len(z_clusters) == 0:
            return 0, []
        if len(z_clusters) == 1:
            return 1, z_clusters

        self.log('----------------------------------->>>')
        self.log('Filtering symmetry equivalent clusters')

        # Extract the protein sites in the reference frame
        d_sites_cart = protein(dataset.model.hierarchy).atoms().extract_xyz()
        d_unit_cell = dataset.model.unit_cell
        d_sym_ops = dataset.model.crystal_contact_operators()

        # Cartesianise and fractionalise the points in each of the clusters
        # (in the crystallographic frame)
        points_cart = []
        points_frac = []
        for c_gps, c_val in z_clusters:
            c_cart = dataset.model.alignment.ref2nat(
                self.grid.grid2cart(c_gps))
            points_cart.append(c_cart)
            # Fractionalise them to the unit cell of the dataset
            points_frac.append(d_unit_cell.fractionalize(c_cart))

        # Find the sets of clusters that are symmetry related
        # NOTE(review): 1.7321 is presumably sqrt(3), i.e. the body diagonal
        # of a grid cell, padded by 5% -- confirm.
        sym_equiv_groups = find_symmetry_equivalent_groups(
            points_frac=points_frac,
            sym_ops=d_sym_ops,
            unit_cell=d_unit_cell,
            cutoff_cart=1.05 * 1.7321 * self.grid_spacing)

        # Compare squared distances to avoid taking square roots
        max_contact_dist_sq = max_contact_dist**2

        # Iterate through and choose one cluster from each group to keep
        filt_z_clusters = []
        for g_id, g_idxs in generate_group_idxs(sym_equiv_groups):
            # Contacts per point for each cluster in the group
            c_contacts = []
            for c_idx in g_idxs:
                c_points_cart = points_cart[c_idx]
                # Brute force all-v-all: count the protein sites whose
                # closest cluster point is within the contact distance
                contacts = sum(
                    1 for rp in d_sites_cart
                    if min((c_points_cart - rp).dot()) < max_contact_dist_sq)
                # Record the number of contacts (over size of cluster)
                c_contacts.append(1.0 * contacts / len(c_points_cart))

            # Keep the cluster with the most contacts
            max_contacts = max(c_contacts)
            if max_contacts == 0:
                raise Exception('MAX CONTACTS IS 0!')
            cluster_to_keep = g_idxs[c_contacts.index(max_contacts)]
            filt_z_clusters.append(z_clusters[cluster_to_keep])

        assert len(filt_z_clusters) == max(
            sym_equiv_groups
        ), 'NUMBER OF UNIQUE GROUPS AND GROUPS TO BE RETURNED NOT THE SAME'

        self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
            len(z_clusters), len(filt_z_clusters)))
        return len(filt_z_clusters), filt_z_clusters
Exemple #7
0
def run(params):
    """Score each input model and write per-residue scores and radar plots.

    For every path in `params.input.pdb`, the mtz path is derived by
    replacing '.pdb' with '.mtz' and the model is scored with `score_model`
    against `params.input.ref_pdb`. The resulting tables are accumulated and
    written to 'residue_scores.csv' in the output directory. One radar plot
    is then made per row of the combined table, followed by one comparison
    plot per residue (rows grouped by the last two '-'-separated fields of
    their index label).

    Raises `ValueError` if `params.input.labels` is neither 'basename' nor
    'folder_name'.
    """

    assert params.input.pdb, 'No pdb files provided'
    # REMOVE THIS WHEN IMPLEMENTED
    assert params.selection.res_names
    # REMOVE THIS WHEN IMPLEMENTED
    if params.selection.res_names:
        params.selection.__inject__("res_names_list",
                                    params.selection.res_names.split(','))
    else:
        params.selection.__inject__("res_names_list", None)

    output_dir, images_dir = prepare_output_directory(params)
    scores_file = os.path.join(output_dir, 'residue_scores.csv')

    all_data = pandas.DataFrame()

    for pdb in params.input.pdb:

        mtz = pdb.replace('.pdb', '.mtz')

        # Choose the prefix used to label this model's rows
        if params.input.labels == 'basename':
            label = os.path.splitext(os.path.basename(pdb))[0]
        elif params.input.labels == 'folder_name':
            label = os.path.basename(os.path.dirname(os.path.abspath(pdb)))
        else:
            # Fail loudly rather than reuse a stale or undefined label
            raise ValueError('Unrecognised labelling option: {!r}'.format(
                params.input.labels))

        # Single-argument print(...) behaves identically under Python 2
        # and also parses under Python 3
        print(bar)
        print('Scoring model {} against {}'.format(pdb, mtz))

        data_table = score_model(params=params,
                                 pdb1=pdb,
                                 mtz1=mtz,
                                 pdb2=params.input.ref_pdb,
                                 label_prefix=label)
        # verify_integrity makes duplicated row labels an error
        all_data = all_data.append(data_table, verify_integrity=True)

    print('...Done')
    print(bar)

    all_data.to_csv(scores_file)
    print('Output written to {}'.format(scores_file))
    print(bar)

    ###################################################################
    # Image parameters
    ###################################################################
    columns = format_parameters_for_plot(params=params.plot.parameters)

    ###################################################################
    # Output Images - 1 image per residue per structure
    ###################################################################
    all_images = []
    print('Generating Output Images...')
    for label, row in all_data.iterrows():
        image_path = os.path.join(images_dir, '{}.png'.format(label))
        print('Making: {}...'.format(image_path))
        make_residue_radar_plot(
            path=image_path,
            data=row.to_frame().T,
            columns=columns,
            remove_blank_entries=params.plot.remove_blank_entries,
            print_axis_values=params.plot.print_axis_values)
        all_images.append(image_path)

    ###################################################################
    # Output Images - 1 image per residue (allowing comparisons)
    ###################################################################
    # With automatic limits the preset limits are dropped; 'manual' keeps
    # them as they are.
    # NOTE(review): this reads params.radar_plot while the other plot
    # options are read from params.plot -- confirm the scope name.
    if params.radar_plot.limits == 'automatic':
        columns.pop('limits', None)

    for res_label, index_idxs in generate_group_idxs(
        [i.split('-')[-2:] for i in all_data.index]):
        res_label = '-'.join(res_label)
        image_path = os.path.join(images_dir,
                                  'compare-{}.png'.format(res_label))
        print('Making: {}...'.format(image_path))
        make_residue_radar_plot(
            path=image_path,
            data=all_data.iloc[index_idxs],
            columns=columns,
            remove_blank_entries=params.plot.remove_blank_entries,
            print_axis_values=params.plot.print_axis_values)

    print('...Done.')
    print(bar)