Exemple #1
0
 def test_plot_leafs(self, small_tree):
     r"""Smoke test: plot the root secondary-structure property by leafs.

     A random 42-character DSSP sequence is generated for every leaf of
     the fixture tree, the resulting properties are propagated up the
     tree, and the ``'ss'`` property of the root node is plotted with
     the ``'leafs'`` option. The test makes no assertion; it only has to
     run without raising.
     """
     tree = small_tree['tree']
     # random.sample draws without replacement, so the alphabet is
     # repeated to let a code show up more than once in a sequence.
     code_pool = 1000 * SSP.dssp_codes
     leaf_props = []
     for _ in range(tree.nleafs):
         letters = random.sample(code_pool, 42)
         leaf_props.append(SSP().from_dssp_sequence(''.join(letters)))
     ps.propagator_size_weighted_sum(leaf_props, tree)
     root_ss = tree.root['ss']
     root_ss.plot('leafs')
Exemple #2
0
def sans_fit(sans_benchmark):
    r"""Build a synthetic "experimental" SANS profile and the parameters
    that generated it.

    The properties in ``sans_benchmark['property_list']`` are propagated
    up a deep copy of ``sans_benchmark['tree_with_no_property']``. The
    synthetic profile is a linear combination of the profiles of the
    nodes found at a fixed tree depth, evaluated at the midpoints of the
    root node's ``x`` values, plus a flat background.

    Parameters
    ----------
    sans_benchmark : dict
        Fixture value; the keys read here are ``'tree_with_no_property'``
        and ``'property_list'``.

    Returns
    -------
    dict
        A dictionary containing the following key, value pairs:
    tree: :class:`~idpflex.cnextend.Tree`
        Copy of the benchmark hierarchical tree, after the property list
        has been propagated with ``propagator_size_weighted_sum``.
    property_name: str
        Just the name of the property
    depth: int
        Tree depth whose nodes were combined to build experiment_property
    coefficients: :py:`dict`
        weights of each node at that depth. (key, val) pair is
        (node ID, weight).
    background : float
        Flat background added to the combined profile (5% of its maximum)
    experiment_property: :class:`~idpflex.properties.SansProperty`
        Profile obtained from the linear combination of the node profiles
        using `coefficients` and `background`, with errors set to 10% of
        the profile.
    """
    tree = deepcopy(sans_benchmark['tree_with_no_property'])
    leaf_properties = sans_benchmark['property_list']
    name = leaf_properties[0].name  # all properties share this name
    idprop.propagator_size_weighted_sum(leaf_properties, tree)

    # The synthetic profile mixes the clusters found at this depth
    depth = 4
    n_nodes = depth + 1  # depth=0 is the root node alone (nclusters=1)
    weights = (0.45, 0.00, 0.07, 0.25, 0.23)  # they must add to one
    level_nodes = tree.nodes_at_depth(depth)

    # Evaluate everything at the midpoints of consecutive root x values
    root_x = tree.root[name].x
    q_values = (root_x[:-1] + root_x[1:]) / 2

    coefficients = {}
    profile = np.zeros(len(q_values))
    for index in range(n_nodes):
        node = level_nodes[index]
        weight = weights[index]
        coefficients[node.id] = weight
        node_y = node[name].y
        # weighted midpoint average of the node profile
        profile += weight * (node_y[:-1] + node_y[1:]) / 2

    background = 0.05 * max(profile)  # flat background
    profile += background
    experiment_property = idprop.SansProperty(name=name,
                                              qvalues=q_values,
                                              profile=profile,
                                              errors=0.1 * profile)
    fit_setup = {
        'tree': tree,
        'property_name': name,
        'depth': depth,
        'coefficients': coefficients,
        'background': background,
        'experiment_property': experiment_property
    }
    return fit_setup
Exemple #3
0
 def test_propagator_size_weighted_sum(self, sans_benchmark):
     r"""Check the size-weighted propagation of SANS profiles.

     The property list of the fixture is propagated up a private copy of
     the fixture tree. One non-leaf node is then picked at random and
     its ``'sans'`` profile is compared with the combination of the
     profiles of its two children, each weighted by the child's share of
     the combined ``count``.

     Parameters
     ----------
     sans_benchmark : dict
         Fixture value; the keys read here are
         ``'tree_with_no_property'`` and ``'property_list'``.
     """
     from copy import deepcopy

     # Propagation attaches 'sans' properties to the tree nodes. Work on
     # a copy so that the fixture's 'tree_with_no_property' is left
     # untouched for any other test receiving the same fixture object.
     tree = deepcopy(sans_benchmark['tree_with_no_property'])
     values = sans_benchmark['property_list']
     ps.propagator_size_weighted_sum(values, tree)
     # Test the propagation of the profiles for a node randomly picked
     node_id = np.random.randint(tree.nleafs, len(tree))  # exclude leafs
     node = tree[node_id]
     ln = node.left
     rn = node.right
     # weight of the "left" child: its share of the combined count
     w = float(ln.count) / (ln.count + rn.count)
     lnp = ln['sans']  # profile of the "left" sibling node
     rnp = rn['sans']  # profile of the "right" sibling node
     y = w * lnp.y + (1 - w) * rnp.y
     assert np.array_equal(y, node['sans'].y)
Exemple #4
0
def benchmark():
    r"""Assemble the benchmark tree together with its reference data.

    The linkage matrix is read from file ``linkage_matrix`` in
    ``data_dir`` and turned into a tree. One scalar property named
    ``'sc'`` per leaf, holding a normally distributed random value
    centered at 100, is propagated up that tree.

    Returns
    -------
    dict
        A dictionary containing the following key, value pairs:
    z:
        The loaded linkage matrix
    tree:
        Tree built from ``z``, after propagating the ``'sc'`` property
    nnodes: int
        44757
    nleafs: int
        22379
    simple_property: list
        ``SimpleProperty(i)`` for every ``i`` in ``range(nleafs)``
    """
    n_leafs = 22379
    linkage_path = os.path.join(data_dir, 'linkage_matrix')
    linkage = np.loadtxt(linkage_path)
    tree = cnextend.Tree(linkage)

    # Instantiate scalar properties for the leaf nodes, then propagate
    # up the tree
    leaf_values = np.random.normal(loc=100.0, size=n_leafs)
    scalar_props = []
    for value in leaf_values:
        scalar_props.append(idprop.ScalarProperty(name='sc', y=value))
    idprop.propagator_size_weighted_sum(scalar_props, tree)

    simple_props = list(map(SimpleProperty, range(n_leafs)))
    return {
        'z': linkage,
        'tree': tree,
        'nnodes': 44757,  # numerically equal to 2 * n_leafs - 1
        'nleafs': n_leafs,
        'simple_property': simple_props,
    }
Exemple #5
0
 def test_propagator_size_weighted_sum(self, small_tree):
     r"""Check that the root profile is the mean of the leaf profiles.

     A random secondary-structure sequence is created for every leaf of
     the tree by drawing ``SSP.n_codes`` codes from ``SSP.dssp_codes``
     without replacement. The resulting profiles are propagated up the
     tree hierarchy, and the ``'ss'`` profile of the root node is
     compared (to 12 decimals) with the plain average of all leaf
     profiles.
     """
     tree = small_tree['tree']
     leaf_props = []
     for _ in range(tree.nleafs):
         shuffled_codes = random.sample(SSP.dssp_codes, SSP.n_codes)
         leaf_props.append(SSP().from_dssp_sequence(''.join(shuffled_codes)))
     ps.propagator_size_weighted_sum(leaf_props, tree)
     # Expected root profile: average of the leaf profiles, one row per leaf
     leaf_profiles = np.asarray([prop.y for prop in leaf_props])
     expected_profile = leaf_profiles.mean(axis=0)
     root_profile = tree.root['ss'].y
     np.testing.assert_array_almost_equal(expected_profile,
                                          root_profile, decimal=12)
Exemple #6
0
def cluster_with_properties(a_universe,
                            pcls,
                            p_names=None,
                            selection='not name H*',
                            segment_length=1000,
                            n_representatives=1000):
    r"""Cluster representative structures according to a set of properties.

    Representative structures (the centroids) are first obtained from the
    trajectory with ``trajectory_centroids``, which receives `selection`,
    `segment_length` and `n_representatives`. Every requested property is
    then calculated for every centroid, so that each centroid is described
    by a property vector with one component per property class.
    Each property is standardized across centroids (zero mean and unity
    variance), and the distance between any two centroids is the
    Euclidean distance between their standardized property vectors.
    The distances between all centroid pairs are employed as the
    similarity measure to generate the hierarchical tree of centroids.

    The frame index of each centroid is stored in the corresponding leaf
    node as a scalar property named ``'iframe'``. The calculated
    properties are stored in the leaf nodes as well, and then propagated
    up to the tree's root node.

    Parameters
    ----------
    a_universe : :class:`~MDAnalysis.core.universe.Universe`
        Topology and trajectory.
    pcls : list
        Property classes, such as :class:`~idpflex.properties.Asphericity`
        or :class:`~idpflex.properties.SaSa`
    p_names : list
        Property names. If None, then the ``default_name`` of each
        property class is used
    selection : str
        atoms for which to calculate RMSD. See the
        `selections page <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_
        for atom selection syntax.
    segment_length: int
        divide trajectory into segments of this length
    n_representatives : int
        Desired total number of representative structures. The final number
        may be close but not equal to the desired number.

    Returns
    -------
    :class:`~idpflex.cluster.ClusterTrove`
        Hierarchical clustering tree of the centroids
    """  # noqa: E501
    centroid_frames = trajectory_centroids(
        a_universe,
        selection=selection,
        segment_length=segment_length,
        n_representatives=n_representatives)
    # can be different than n_representatives
    n_centroids = len(centroid_frames)

    # Fall back to the default name of each property class
    if p_names is None:
        p_names = [pcl.default_name for pcl in pcls]

    # One list of per-centroid properties for each property class
    properties_by_class = list()
    for p_name, pcl in zip(p_names, pcls):
        per_centroid = list()
        for i_frame in tqdm(centroid_frames):
            prop = pcl(name=p_name).from_universe(a_universe, index=i_frame)
            per_centroid.append(prop)
        properties_by_class.append(per_centroid)

    # Rows are properties, columns are centroids
    features = np.zeros((len(pcls), n_centroids))
    for row, props in enumerate(properties_by_class):
        features[row] = [p.y for p in props]
    # zero mean and unity variance for each property, then one row per
    # centroid
    features = np.transpose(zscore(features, axis=1))
    pairwise = scipy.spatial.distance_matrix(features, features)
    # square matrix -> condensed form, which is what linkage receives
    distance_matrix = squareform(pairwise)

    # Cluster the representative structures
    linkage_matrix = hierarchy.linkage(distance_matrix, method='complete')
    tree = Tree(z=linkage_matrix)
    for i_leaf, leaf in enumerate(tree.leafs):
        frame_prop = ScalarProperty(name='iframe', y=centroid_frames[i_leaf])
        leaf.add_property(frame_prop)

    # Propagate the properties up the tree
    for props in properties_by_class:
        propagator_size_weighted_sum(props, tree)

    return ClusterTrove(centroid_frames, distance_matrix, tree)