def get_clusters(self):
    """Partition the interaction graph into clusters of bounded size.

    The graph is built with ``self.build_graph()`` and then clustered by
    betweenness centrality at successively lower thresholds (9 down to 4)
    until the largest connected component of ``self.g`` contains at most
    ``self.maxnum_residues_in_cluster`` vertices.  Any component that is
    still larger than the limit after the last pass is sorted and cut into
    two halves.

    Returns:
        A list of lists.  Every vertex index reported by the component
        analysis is shifted by +1 (presumably converting 0-based vertex
        indices into 1-based residue numbers -- confirm against callers).
        An empty list is returned when the graph has no components.
    """
    self.build_graph()
    size_limit = self.maxnum_residues_in_cluster
    threshold = 9
    components = []
    while threshold >= 4:
        # The returned edge-centrality map is not needed here; the call is
        # kept for its effect on self.g.
        # NOTE(review): presumably it removes high-centrality edges in
        # place -- confirm against the clustering_algorithm docs.
        clustering_algorithm.betweenness_centrality_clustering(
            graph=self.g, threshold=threshold)
        components = cca.connected_components(graph=self.g)
        component_sizes = [len(component) for component in components]
        # An empty graph has nothing left to split; without this guard
        # max() would raise ValueError on the empty size list.
        if not component_sizes or max(component_sizes) <= size_limit:
            break
        threshold -= 1

    final_components = []
    for component in components:
        # Build a new list rather than mutating the one handed back by the
        # connected-component analysis.
        shifted = [x + 1 for x in component]
        if len(shifted) > size_limit:
            # NOTE(review): a single halving does not guarantee that both
            # halves respect size_limit when the component is more than
            # twice the limit; behaviour kept as in the original.
            shifted.sort()
            middle = len(shifted) // 2
            final_components.append(shifted[:middle])
            final_components.append(shifted[middle:])
        else:
            final_components.append(shifted)
    return final_components
def connected_segments(self):
    """Return the connected components of ``self.graph`` as groups of atoms.

    Every vertex of every component is translated through ``self.atom_for``
    before being returned, so the result is a list of lists of whatever
    objects ``self.atom_for`` maps vertices to.
    """
    from boost_adaptbx.graph import connected_component_algorithm as cca

    vertex_groups = cca.connected_components(graph=self.graph)
    lookup = self.atom_for
    segments = []
    for vertex_group in vertex_groups:
        segments.append([lookup[vertex] for vertex in vertex_group])
    return segments
def manipulation(self, g):
    """Check component analysis on a graph with one edge and one lone vertex.

    Three vertices are added to ``g`` and only the first two are joined, so
    exactly two components are expected: the joined pair and the isolated
    vertex.
    """
    first = g.add_vertex()
    second = g.add_vertex()
    isolated = g.add_vertex()
    g.add_edge(vertex1=first, vertex2=second)

    found = cca.connected_components(graph=g)
    self.assertEqual(len(found), 2)

    # Compare as sets of frozensets so that neither the order of the
    # components nor the order of vertices inside them matters.
    actual = {frozenset(members) for members in found}
    expected = {frozenset([first, second]), frozenset([isolated])}
    self.assertEqual(actual, expected)
def build_and_test(self, g, params):
    """Cluster a freshly built graph and compare against expected values.

    Args:
        g: graph object handed to ``self.graph_build``.
        params: ``(threshold, exp_ecs, exp_comps)`` -- the clustering
            threshold, the expected centrality values for every edge
            except ``eds[3]``, and either ``None`` or the expected
            components given as iterables of indices into the vertex list.
    """
    threshold, exp_ecs, exp_comps = params
    vds, eds = self.graph_build(g)
    ecmap = clustering_algorithm.betweenness_centrality_clustering(
        graph=g,
        threshold=threshold,
    )

    # The edge at position 3 must be absent from the centrality map.
    absent_edge = eds[3]
    self.assertTrue(absent_edge not in ecmap)
    self.assertEqual(len(ecmap), len(exp_ecs))
    observed = [ecmap[ed] for ed in eds if ed != absent_edge]
    self.assertEqual(exp_ecs, observed)

    if exp_comps is None:
        return

    from boost_adaptbx.graph import connected_component_algorithm as cca

    comps = cca.connected_components(graph=g)
    actual = {frozenset(c) for c in comps}
    expected = {frozenset(vds[i] for i in c) for c in exp_comps}
    self.assertEqual(actual, expected)
def build_and_test(self, g, params):
    """Run betweenness-centrality clustering on ``g`` and verify the outcome.

    ``params`` unpacks to ``(threshold, exp_ecs, exp_comps)``: the clustering
    threshold, the expected centralities of all edges other than ``eds[3]``,
    and optionally (when not ``None``) the expected connected components
    expressed as groups of vertex positions.
    """
    (threshold, exp_ecs, exp_comps) = params
    (vds, eds) = self.graph_build(g)
    ecmap = clustering_algorithm.betweenness_centrality_clustering(
        graph=g, threshold=threshold)

    excluded = eds[3]
    self.assertTrue(excluded not in ecmap)
    self.assertEqual(len(ecmap), len(exp_ecs))

    # Collect the centralities of every edge except the excluded one.
    centralities = []
    for ed in eds:
        if ed != excluded:
            centralities.append(ecmap[ed])
    self.assertEqual(exp_ecs, centralities)

    if exp_comps is not None:
        from boost_adaptbx.graph import connected_component_algorithm as cca

        found = {frozenset(c) for c in cca.connected_components(graph=g)}
        wanted = {frozenset(vds[i] for i in c) for c in exp_comps}
        self.assertEqual(found, wanted)
def __init__(self,
             pdb_hierarchy,
             crystal_symmetry,
             angular_difference_threshold_deg=5.,
             sequence_identity_threshold=90.,
             quiet=False):
    """Find groups of chains related by translational NCS (tNCS).

    Outline of what the constructor does:

    1. Atoms matching ``"altloc ' ' and (protein or nucleotide)"`` are
       selected and the first model is copied into a fresh hierarchy.
    2. Each chain ``c1`` is removed from that hierarchy and compared with
       every chain of the P1 expansion of the remaining chains: sequence
       identity is computed and, when high enough, the matched atoms are
       superposed by least squares.
    3. Chain pairs passing both thresholds become edges of a graph whose
       connected components are the tNCS groups.
    4. For every pair of chains inside a group an ``ext.pair`` is appended
       to ``self.ncs_pairs``.

    Attributes set:
        ncs_pairs: list of ``ext.pair`` objects, one per chain pair in
            each group; ``id`` is the index of the group.
        tncsresults: ``[max order, chain id, vectors, fracscat]`` where
            ``vectors`` holds ``(fractional t, orthogonalized t, angle)``
            tuples.  Element 3 is overwritten for every group, so it ends
            up holding the value of the last group processed.

    Args:
        pdb_hierarchy: model hierarchy; only its first model is used.
        crystal_symmetry: supplies the unit cell and is used for the P1
            expansion.
        angular_difference_threshold_deg: a pair is accepted only when the
            rotation angle of its superposition is below this value.
        sequence_identity_threshold: a pair is accepted only when its
            percent sequence identity is above this value.
        quiet: when true, nothing is printed.
    """
    h = pdb_hierarchy
    # Superposition is attempted at a lower identity than the one finally
    # required for acceptance (80 for the default threshold of 90).
    superposition_threshold = 2 * sequence_identity_threshold - 100.
    # Atom count of the hierarchy *before* the selection below is applied.
    n_atoms_all = h.atoms_size()
    s_str = "altloc ' ' and (protein or nucleotide)"
    h = h.select(h.atom_selection_cache().selection(s_str))
    h1 = iotbx.pdb.hierarchy.root()
    h1.append_model(h.models()[0].detached_copy())
    unit_cell = crystal_symmetry.unit_cell()
    # Maps frozenset({chain, chain}) -> list of [p_identity, superposition],
    # one entry per symmetry copy; collapsed to a single entry further down.
    result = {}
    if not quiet:
        print("Find groups of chains related by translational NCS")
    # double loop over chains to find matching pairs related by pure translation
    for c1 in h1.chains():
        # Detach c1 so that it is only compared with the chains still in h1;
        # this way every unordered pair is visited once.
        c1.parent().remove_chain(c1)
        nchains = len(h1.models()[0].chains())
        if ([c1.is_protein(), c1.is_na()].count(True) == 0):
            continue
        r1 = list(c1.residues())
        c1_seq = "".join(c1.as_sequence())
        sc_1_tmp = c1.atoms().extract_xyz()
        h1_p1 = h1.expand_to_p1(crystal_symmetry=crystal_symmetry)
        for (ii, c2) in enumerate(h1_p1.chains()):
            # Map the symmetry copy back to the chain it was generated from.
            # NOTE(review): assumes expand_to_p1 emits the chains of each
            # symmetry copy in the original order -- confirm.
            orig_c2 = h1.models()[0].chains()[ii % nchains]
            r2 = list(c2.residues())
            c2_seq = "".join(c2.as_sequence())
            sites_cart_1, sites_cart_2 = None, None
            sc_2_tmp = c2.atoms().extract_xyz()
            # chains are identical
            if (c1_seq == c2_seq and sc_1_tmp.size() == sc_2_tmp.size()):
                sites_cart_1 = sc_1_tmp
                sites_cart_2 = sc_2_tmp
                p_identity = 100.
            # chains are not identical, do alignment
            else:
                align_obj = mmtbx.alignment.align(seq_a=c1_seq, seq_b=c2_seq)
                alignment = align_obj.extract_alignment()
                matches = alignment.matches()
                equal = matches.count("|")
                total = len(alignment.a) - alignment.a.count("-")
                p_identity = 100. * equal / max(1, total)
                if (p_identity > superposition_threshold):
                    sites_cart_1 = flex.vec3_double()
                    sites_cart_2 = flex.vec3_double()
                    for i1, i2, match in zip(alignment.i_seqs_a,
                                             alignment.i_seqs_b, matches):
                        if (i1 is not None and i2 is not None
                                and match == "|"):
                            r1i, r2i = r1[i1], r2[i2]
                            assert r1i.resname == r2i.resname, [
                                r1i.resname, r2i.resname, i1, i2
                            ]
                            # Pair atoms by name; the first atom of r2i with
                            # a matching name is used.
                            for a1 in r1i.atoms():
                                for a2 in r2i.atoms():
                                    if (a1.name == a2.name):
                                        sites_cart_1.append(a1.xyz)
                                        sites_cart_2.append(a2.xyz)
                                        break
            # superpose two sequence-aligned chains
            if ([sites_cart_1, sites_cart_2].count(None) == 0):
                lsq_fit_obj = superpose.least_squares_fit(
                    reference_sites=sites_cart_1, other_sites=sites_cart_2)
                angle = lsq_fit_obj.r.rotation_angle()
                t_frac = unit_cell.fractionalize(
                    (sites_cart_1 - sites_cart_2).mean())
                # keep only the fractional part of each component
                t_frac = [math.modf(t)[0] for t in t_frac]  # put into [-1,1]
                # 4/3 times the mean distance of the sites from their centroid
                radius = flex.sum(
                    flex.sqrt((sites_cart_1 - sites_cart_1.mean()
                               ).dot())) / sites_cart_1.size() * 4. / 3.
                fracscat = min(c1.atoms_size(), c2.atoms_size()) / n_atoms_all
                result.setdefault(frozenset([c1, orig_c2]), []).append([
                    p_identity,
                    [lsq_fit_obj.r, t_frac, angle, radius, fracscat]
                ])
            else:
                result.setdefault(frozenset([c1, orig_c2]),
                                  []).append([p_identity, None])
    # Build graph
    g = graph.adjacency_list()
    vertex_handle = {}
    for key in result:
        seqid = result[key][0][0]
        # Of all symmetry copies keep the superposition with the smallest
        # rotation angle (entries without a superposition sort first).
        sup = min(result[key],
                  key=lambda s: 0 if s[1] is None else s[1][2])[1]
        result[key] = [seqid, sup]
        if ((seqid > sequence_identity_threshold)
                and (sup[2] < angular_difference_threshold_deg)):
            (c1, c2) = key
            if (c1 not in vertex_handle):
                vertex_handle[c1] = g.add_vertex(label=c1)
            if (c2 not in vertex_handle):
                vertex_handle[c2] = g.add_vertex(label=c2)
            g.add_edge(vertex1=vertex_handle[c1], vertex2=vertex_handle[c2])
    # Do connected component analysis and compose final tNCS pairs object
    components = connected_component_algorithm.connected_components(g)
    import itertools
    self.ncs_pairs = []
    self.tncsresults = [0, "", [], 0.0]
    for (i, group) in enumerate(components):
        chains = [g.vertex_label(vertex=v) for v in group]
        fracscats = []
        radii = []
        # First pass: average fracscat and radius over all pairs of the group.
        # NOTE(review): sup is indexed unconditionally, so this assumes every
        # pair inside a component has a superposition (not None) -- confirm.
        for pair in itertools.combinations(chains, 2):
            sup = result[frozenset(pair)][1]
            fracscats.append(sup[-1])
            radii.append(sup[-2])
        fs = sum(fracscats) / len(fracscats)
        self.tncsresults[3] = fs  # store fracscat in array
        rad = sum(radii) / len(radii)
        # Second pass: emit one ext.pair per chain pair and track the longest
        # run of consecutive pairs sharing the same first chain.
        maxorder = 1
        vectors = []
        previous_id = next(itertools.combinations(chains, 2))[0].id
        for pair in itertools.combinations(chains, 2):
            sup = result[frozenset(pair)][1]
            ncs_pair = ext.pair(
                r=sup[0],
                t=sup[1],
                radius=rad,
                radius_estimate=rad,
                fracscat=fs,
                rho_mn=flex.double(),  # rho_mn undefined, needs to be set later
                id=i)
            self.ncs_pairs.append(ncs_pair)
            # show tNCS pairs in group
            fmt = "group %d chains %s <> %s angle: %4.2f trans.vect.: (%s) fracscat: %5.3f"
            t = ",".join([("%6.3f" % t_).strip() for t_ in sup[1]]).strip()
            if not quiet:
                print(fmt % (i, pair[0].id, pair[1].id, sup[2], t, fs))
            if pair[0].id == previous_id:
                maxorder += 1
                orthoxyz = unit_cell.orthogonalize(sup[1])
                vectors.append((sup[1], orthoxyz, sup[2]))
            else:
                # A new first chain starts a new run; the current pair is
                # not added to the fresh vector list.
                previous_id = pair[0].id
                maxorder = 1
                vectors = []
            if maxorder > self.tncsresults[0]:
                self.tncsresults[0] = maxorder
                self.tncsresults[1] = previous_id
                self.tncsresults[2] = vectors
    if not quiet:
        print("Largest TNCS order, peptide chain, fracvector, orthvector, angle, fracscat = ", \
              str(self.tncsresults))
def connected_components(
    miller_array: cctbx.miller.array,
) -> "tuple[cctbx.miller.set, list[cctbx.miller.set]]":
    """
    Identify connected regions of missing reflections in the asymmetric unit.

    This is achieved by first generating the complete set of possible miller
    indices, then performing connected components analysis on a graph of
    nearest neighbours in the list of missing reflections.

    Args:
        miller_array: The input list of reflections.

    Returns:
        A 2-tuple ``(complete_set, missing_regions)``. ``complete_set`` is the
        non-anomalous complete set of possible miller indices in the setting of
        the input array. ``missing_regions`` is the list of miller sets (one per
        connected region of missing reflections, largest first, also in the
        input setting); it is empty when no reflections are missing.
    """

    # Map to primitive setting for centred cells, otherwise true missing reflections
    # won't be identified as connected as a result of being separated by systematically
    # absent reflections.
    cb_op_to_primitive = miller_array.change_of_basis_op_to_primitive_setting()
    cb_op_primitive_inp = cb_op_to_primitive.inverse()
    miller_array = miller_array.change_basis(cb_op_to_primitive)

    # First generate the missing_set of reflections. We want the full sphere of missing
    # reflections to allow us to find connected regions that cross the boundary of the
    # asu.
    unique = miller_array.unique_under_symmetry().map_to_asu()
    unique = unique.generate_bijvoet_mates()
    complete_set = unique.complete_set()
    missing_set = complete_set.lone_set(unique)
    missing_set = missing_set.expand_to_p1().customized_copy(
        crystal_symmetry=missing_set.crystal_symmetry())

    # The complete set handed back to the caller is the same object on every
    # return path: non-anomalous and mapped back to the input setting.
    returned_complete_set = (
        unique.as_non_anomalous_set().complete_set().change_basis(
            cb_op_primitive_inp))

    if missing_set.size() == 0:
        return returned_complete_set, []

    # Now find the nearest neighbours.
    coords = missing_set.indices().as_vec3_double().as_double()
    k = 6
    ann = AnnAdaptor(data=coords, dim=3, k=k)
    ann.query(coords)

    # Construct the graph of connected missing reflections
    g = graph.adjacency_list(
        graph_type="undirected",
        vertex_type="vector",
        edge_type="set",
    )
    distance_cutoff = 2**0.5
    for i in range(missing_set.size()):
        # Results for query point i occupy the flat slice [i * k, (i + 1) * k).
        ik = i * k
        for i_ann in range(k):
            if ann.distances[ik + i_ann] <= distance_cutoff:
                j = ann.nn[ik + i_ann]
                g.add_edge(i, j)

    # Now do the connected components analysis, filtering out lone missing reflections
    components = [c for c in cca.connected_components(graph=g) if len(c) > 1]

    # Determine the unique miller indices for each component within the asu.
    # Components that reduce to the same set of indices are reported once.
    seen_index_sets = set()
    unique_ms = []
    for c in components:
        ms = (missing_set.select(flex.size_t(list(c))).customized_copy(
            crystal_symmetry=miller_array).as_non_anomalous_set().map_to_asu())
        ms = ms.unique_under_symmetry()
        index_set = frozenset(ms.indices())
        if index_set not in seen_index_sets:
            seen_index_sets.add(index_set)
            unique_ms.append(ms)

    # Sort connected regions by size
    unique_ms.sort(key=lambda ms: ms.size(), reverse=True)

    # Map indices back to input setting
    return (
        returned_complete_set,
        [ms.change_basis(cb_op_primitive_inp) for ms in unique_ms],
    )
def Test():
    """Test function for all functions provided above.

    returns: Empty string on success, string describing the problem on failure.
    """

    def _checkComponents(components, expectedCount, expectedMax, case, prefix):
        """Compare component count and largest size against expectations.

        Returns an empty string on success, otherwise the failure message
        (starting with prefix) for the first check that fails.
        """
        if len(components) != expectedCount:
            return (prefix + "Expected " + str(expectedCount) +
                    " components, found " + str(len(components)) +
                    " for case " + str(case))
        # -1 is the reported size when there are no components at all.
        maxLen = max([len(c) for c in components] + [-1])
        if maxLen != expectedMax:
            return (prefix + "Expected max sized component of " +
                    str(expectedMax) + ", found " + str(maxLen) +
                    " for case " + str(case))
        return ""

    # Construct a set of Mover/atoms that will be used to test the routines. They will all be part of the
    # same residue and they will all have the same unit radius in the extraAtomInfo associated with them.
    # There will be a set of five along the X axis, with one pair overlapping slightly and the others
    # spaced 0.45 units apart so that they will overlap when using a probe radius of 0.25 (diameter 0.5).
    # There will be another one that is obliquely located away from the first such that it will overlap
    # in a bounding-box test but not in a true atom-comparison test for a probe with radius 0.25. There
    # will be a final one 10 units above the origin.
    rad = 1.0
    probeRad = 0.25
    locs = [[0.0, 0.0, 0.0], [1.9, 0.0, 0.0]]
    for i in range(1, 4):
        locs.append([1.9 + 2.1 * i, 0.0, 0.0])
    delta = 2 * rad + 2 * probeRad - 0.1
    # The same offset is used for X and Y (the 45-degree direction).
    dist = -delta * math.sin(math.pi / 4)
    locs.append([dist, dist, 0.0])
    locs.append([0.0, 0.0, 10.0])

    name = " H "
    ag = pdb.hierarchy.atom_group()
    ag.resname = "LYS"
    atoms = pdb.hierarchy.af_shared_atom()
    extras = []
    movers = []
    baseAtom = pdb.hierarchy.atom()
    for loc in locs:
        a = pdb.hierarchy.atom(parent=ag, other=baseAtom)
        a.name = name
        a.xyz = loc
        atoms.append(a)
        extras.append(probe.ExtraAtomInfo(rad))
        # NOTE(review): the map is rebuilt for every atom, so each Mover is
        # given the map as it stood when that atom was added; only the last
        # map is used below. Kept as in the original.
        extrasMap = probeExt.ExtraAtomInfoMap(atoms, extras)
        movers.append(Movers.MoverNull(a, extrasMap))
    # Fix the sequence numbers, which are otherwise all 0
    atoms.reset_i_seq()

    # Generate a table of parameters and expected results. The first entry in each row is
    # the probe radius. The second is the expected number of connected components.
    # The third is the size of the largest connected component.
    _expectedCases = [[0.0, 5, 3], [probeRad, 2, 6], [100, 1, 7]]

    # Specify the probe radius and run the test. Compare the results to what we expect.
    for i, e in enumerate(_expectedCases):
        probeRadius, expectedCount, expectedMax = e
        g = _InteractionGraphAABB(movers, extrasMap, probeRadius)

        # Find the connected components of the graph and compare their counts and maximum size to
        # what is expected.
        components = cca.connected_components(graph=g)
        problem = _checkComponents(components, expectedCount, expectedMax, i,
                                   "AABB ")
        if problem:
            return problem

    # Generate a table of parameters and expected results. The first entry in each row is
    # the probe radius. The second is the expected number of connected components.
    # The third is the size of the largest connected component.
    # The fourth (not present in the AABB table above) is the set of expected sizes of
    # atomMoverSets across all atoms; not one per atom but across all atoms what answers are
    # expected. The easiest to explain is the 100-radius entry, which should have all atoms interacting
    # with all Movers so the only answer across all atoms is 7. The 0-radius case has only one pair
    # of overlaps, so only up to 2 Movers per atom. The middle case has some Movers overlapping with
    # two neighbors, so up to 3 Movers associated with a given atom.
    _expectedCases = [
        # One of the pairs actually does not overlap for the all-pairs test. Other conditions are the same
        # as the AABB tests.
        [0.0, 6, 2, {1, 2}],
        [probeRad, 2, 6, {1, 2, 3}],
        [100, 1, 7, {7}]
    ]

    # Specify the probe radius and run the test. Compare the results to what we expect.
    for i, e in enumerate(_expectedCases):
        probeRadius, expectedCount, expectedMax, expectedLengths = e
        g, am = InteractionGraphAllPairs(movers, extrasMap, probeRadius)

        # Find the connected components of the graph and compare their counts and maximum size to
        # what is expected.
        components = cca.connected_components(graph=g)
        problem = _checkComponents(components, expectedCount, expectedMax, i,
                                   "")
        if problem:
            return problem

        # Check atom/Mover overlaps by finding the set of lengths that are present across all atoms.
        lengths = set()
        for a in atoms:
            lengths.add(len(am[a]))
        if lengths != expectedLengths:
            return ("Expected set of overlap counts " + str(expectedLengths) +
                    ", found " + str(lengths) + " for case " + str(i))

    return ""
def __init__(self,
             pdb_hierarchy,
             crystal_symmetry,
             angular_difference_threshold_deg=5.,
             sequence_identity_threshold=90.,
             quiet=False):
    """Find groups of chains related by translational NCS (tNCS).

    Atoms matching ``"altloc ' ' and (protein or nucleotide)"`` of the first
    model are compared chain by chain: each chain is paired with every chain
    of the P1 expansion of the chains that follow it, the sequence identity
    is computed and, when high enough, the matched atoms are superposed by
    least squares.  Chain pairs passing both thresholds become edges of a
    graph whose connected components are the tNCS groups.  For every pair of
    chains inside a group an ``ext.pair`` is appended to ``self.ncs_pairs``
    (``id`` is the index of the group).

    Args:
        pdb_hierarchy: model hierarchy; only its first model is used.
        crystal_symmetry: supplies the unit cell and is used for the P1
            expansion.
        angular_difference_threshold_deg: a pair is accepted only when the
            rotation angle of its superposition is below this value.
        sequence_identity_threshold: a pair is accepted only when its
            percent sequence identity is above this value.
        quiet: when true, nothing is printed (default keeps the previous
            behaviour of always printing).
    """
    import itertools

    h = pdb_hierarchy
    # Superposition is attempted at a lower identity than the one finally
    # required for acceptance (80 for the default threshold of 90).
    superposition_threshold = 2 * sequence_identity_threshold - 100.
    # Atom count of the hierarchy *before* the selection below is applied.
    n_atoms_all = h.atoms_size()
    s_str = "altloc ' ' and (protein or nucleotide)"
    h = h.select(h.atom_selection_cache().selection(s_str))
    h1 = iotbx.pdb.hierarchy.root()
    h1.append_model(h.models()[0].detached_copy())
    unit_cell = crystal_symmetry.unit_cell()
    # Maps frozenset({chain, chain}) -> list of [p_identity, superposition],
    # one entry per symmetry copy; collapsed to a single entry further down.
    result = {}
    if not quiet:
        print("Find groups of chains related by translational NCS")
    # double loop over chains to find matching pairs related by pure translation
    for c1 in h1.chains():
        # Detach c1 so that it is only compared with the chains still in h1;
        # this way every unordered pair is visited once.
        c1.parent().remove_chain(c1)
        nchains = len(h1.models()[0].chains())
        if ([c1.is_protein(), c1.is_na()].count(True) == 0):
            continue
        r1 = list(c1.residues())
        c1_seq = "".join(c1.as_sequence())
        sc_1_tmp = c1.atoms().extract_xyz()
        h1_p1 = h1.expand_to_p1(crystal_symmetry=crystal_symmetry)
        for (ii, c2) in enumerate(h1_p1.chains()):
            # Map the symmetry copy back to the chain it was generated from.
            # NOTE(review): assumes expand_to_p1 emits the chains of each
            # symmetry copy in the original order -- confirm.
            orig_c2 = h1.models()[0].chains()[ii % nchains]
            r2 = list(c2.residues())
            c2_seq = "".join(c2.as_sequence())
            sites_cart_1, sites_cart_2 = None, None
            sc_2_tmp = c2.atoms().extract_xyz()
            # chains are identical
            if (c1_seq == c2_seq and sc_1_tmp.size() == sc_2_tmp.size()):
                sites_cart_1 = sc_1_tmp
                sites_cart_2 = sc_2_tmp
                p_identity = 100.
            # chains are not identical, do alignment
            else:
                align_obj = mmtbx.alignment.align(seq_a=c1_seq, seq_b=c2_seq)
                alignment = align_obj.extract_alignment()
                matches = alignment.matches()
                equal = matches.count("|")
                total = len(alignment.a) - alignment.a.count("-")
                p_identity = 100. * equal / max(1, total)
                if (p_identity > superposition_threshold):
                    sites_cart_1 = flex.vec3_double()
                    sites_cart_2 = flex.vec3_double()
                    for i1, i2, match in zip(alignment.i_seqs_a,
                                             alignment.i_seqs_b, matches):
                        if (i1 is not None and i2 is not None
                                and match == "|"):
                            r1i, r2i = r1[i1], r2[i2]
                            assert r1i.resname == r2i.resname, [
                                r1i.resname, r2i.resname, i1, i2
                            ]
                            # Pair atoms by name; the first atom of r2i with
                            # a matching name is used.
                            for a1 in r1i.atoms():
                                for a2 in r2i.atoms():
                                    if (a1.name == a2.name):
                                        sites_cart_1.append(a1.xyz)
                                        sites_cart_2.append(a2.xyz)
                                        break
            # superpose two sequence-aligned chains
            if ([sites_cart_1, sites_cart_2].count(None) == 0):
                lsq_fit_obj = superpose.least_squares_fit(
                    reference_sites=sites_cart_1, other_sites=sites_cart_2)
                angle = lsq_fit_obj.r.rotation_angle()
                t_frac = unit_cell.fractionalize(
                    (sites_cart_1 - sites_cart_2).mean())
                # keep only the fractional part of each component
                t_frac = [math.modf(t)[0] for t in t_frac]  # put into [-1,1]
                # 4/3 times the mean distance of the sites from their centroid
                radius = flex.sum(
                    flex.sqrt((sites_cart_1 - sites_cart_1.mean()
                               ).dot())) / sites_cart_1.size() * 4. / 3.
                # float() forces true division; plain int / int would
                # truncate to 0 on Python 2 without
                # "from __future__ import division".
                fracscat = float(min(c1.atoms_size(),
                                     c2.atoms_size())) / n_atoms_all
                result.setdefault(frozenset([c1, orig_c2]), []).append([
                    p_identity,
                    [lsq_fit_obj.r, t_frac, angle, radius, fracscat]
                ])
            else:
                result.setdefault(frozenset([c1, orig_c2]),
                                  []).append([p_identity, None])
    # Build graph
    g = graph.adjacency_list()
    vertex_handle = {}
    for key in result:
        seqid = result[key][0][0]
        # Of all symmetry copies keep the superposition with the smallest
        # rotation angle (entries without a superposition sort first).
        sup = min(result[key],
                  key=lambda s: 0 if s[1] is None else s[1][2])[1]
        result[key] = [seqid, sup]
        if ((seqid > sequence_identity_threshold)
                and (sup[2] < angular_difference_threshold_deg)):
            (c1, c2) = key
            if (c1 not in vertex_handle):
                vertex_handle[c1] = g.add_vertex(label=c1)
            if (c2 not in vertex_handle):
                vertex_handle[c2] = g.add_vertex(label=c2)
            g.add_edge(vertex1=vertex_handle[c1], vertex2=vertex_handle[c2])
    # Do connected component analysis and compose final tNCS pairs object
    components = connected_component_algorithm.connected_components(g)
    self.ncs_pairs = []
    fmt = "group %d chains %s <> %s angle: %4.2f trans.vect.: (%s) fracscat: %5.3f"
    for (i, group) in enumerate(components):
        chains = [g.vertex_label(vertex=v) for v in group]
        fracscats = []
        radii = []
        # First pass: average fracscat and radius over all pairs of the group.
        # NOTE(review): sup is indexed unconditionally, so this assumes every
        # pair inside a component has a superposition (not None) -- confirm.
        for pair in itertools.combinations(chains, 2):
            sup = result[frozenset(pair)][1]
            fracscats.append(sup[-1])
            radii.append(sup[-2])
        fs = sum(fracscats) / len(fracscats)
        rad = sum(radii) / len(radii)
        # Second pass: emit one ext.pair per chain pair of the group.
        for pair in itertools.combinations(chains, 2):
            sup = result[frozenset(pair)][1]
            ncs_pair = ext.pair(
                r=sup[0],
                t=sup[1],
                radius=rad,
                radius_estimate=rad,
                fracscat=fs,
                rho_mn=flex.double(),  # rho_mn undefined, needs to be set later
                id=i)
            self.ncs_pairs.append(ncs_pair)
            # show tNCS pairs in group
            if not quiet:
                t = ",".join([("%6.3f" % t_).strip() for t_ in sup[1]]).strip()
                print(fmt % (i, pair[0].id, pair[1].id, sup[2], t, fs))