Beispiel #1
0
    def test_different_cores(self):
        """Compare serial results with per-rank worker results.

        For each ``frame_slice`` the angle and distance are first computed
        directly on the sliced trajectory.  The same commands are then run
        through ``worker_by_state`` once per rank (for 1 and 2 cores), the
        per-rank pieces are concatenated and checked against the serial
        values.
        """
        first_frame = pt.iterload(fn('tz2.nc'), fn('tz2.parm7'))[0]
        slices_to_try = [
            (0, 100),
            (10, 100, 3),
            (50, 80, 2),
            (51, 80, 3),
        ]

        for frame_slice in slices_to_try:
            traj = pt.iterload(fn('tz2.nc'),
                               fn('tz2.parm7'),
                               frame_slice=frame_slice)
            expected_angle = pt.angle(traj, ':3 :7 :9')
            expected_dist = pt.distance(traj, ':2 :10')
            # NOTE: the rmsd is computed but not compared with anything below
            unused_rmsd = pt.rmsd(traj, ref=first_frame, mask='@CA')

            commands = ['angle :3 :7 :9', 'distance :2 :10']

            for n_cores in (1, 2):
                worker_output = []
                for rank in range(n_cores):
                    worker_output.append(
                        worker_by_state(rank, n_cores, traj, commands))
                # element 1 of each worker result is what gets concatenated
                per_rank_data = [result[1] for result in worker_output]
                merged = concat_dict(per_rank_data)
                aa_eq(merged['Ang_00002'], expected_angle)
                aa_eq(merged['Dis_00003'], expected_dist)
Beispiel #2
0
def test_mpi_load_batch():
    """Run cpptraj commands through MPI and compare with a serial run.

    Every rank calls ``_load_batch_pmap`` in ``'mpi'`` mode.  Ranks other
    than 0 are expected to get ``None`` back; rank 0 concatenates the
    gathered pieces and checks them against ``pt.load_batch`` run serially.
    """
    # ``comm`` gives access to the number of processes and this process' rank
    comm = MPI.COMM_WORLD

    # _load_batch_pmap is a temporary method name and may change in future

    # load to TrajectoryIterator
    traj = pt.iterload(fn("tz2.nc"), fn("tz2.parm7"), frame_slice=(0, 4000))

    # cpptraj commands to run on every chunk
    lines = ['autoimage', 'distance :3 :10', 'molsurf @CA']

    # results are gathered on the first process (rank=0)
    gathered = _load_batch_pmap(comm.size,
                                lines=lines,
                                traj=traj,
                                mode='mpi',
                                dtype='dict')

    if comm.rank != 0:
        assert gathered is None
        return

    # Only rank 0 reaches this point.
    # Element 0 of every gathered item is concatenated
    # (presumably the per-chunk dict -- confirm against _load_batch_pmap);
    # ``concat_dict`` comes from ``pytraj.tools``.
    parallel_data = concat_dict(item[0] for item in gathered)

    # serial reference (not needed when copying this into your own script)
    state = pt.load_batch(traj, lines)
    state.run()
    serial_data = state.data[1:].to_dict()

    key_pairs = zip(sorted(parallel_data), sorted(serial_data))
    for parallel_key, serial_key in key_pairs:
        aa_eq(parallel_data[parallel_key], serial_data[serial_key])
Beispiel #3
0
def concat_hbond(data_collection):
    '''Merge per-chunk hbond dictionaries into a single dictionary.

    Keys that are absent from a chunk (other than keys starting with
    ``'total'``) are filled with zeros of that chunk's length before all
    chunks are concatenated with ``concat_dict``.

    Parameters
    ----------
    data_collection : List[Tuple(OrderedDict[key, hbond], n_frames)]

    Returns
    -------
    OrderedDict[key, hbond]
        Each value is cast to ``'i4'`` when the cast does not raise
        ``ValueError``; otherwise it is left unchanged.

    Notes
    -----
    data_collection will be updated.
    '''
    # union of the keys seen in any chunk
    hbond_keys = set()
    for chunk in data_collection:
        hbond_keys.update(chunk[0].keys())

    # keys starting with 'total' are never zero-filled
    total_keys = [name for name in hbond_keys if name.startswith('total')]
    hbond_keys.difference_update(total_keys)

    # chunk : Tuple[OrderedDict, n_frames]
    for chunk in data_collection:
        absent = hbond_keys - set(chunk[0].keys())
        chunk_len = chunk[1]
        for name in absent:
            chunk[0][name] = np.zeros(chunk_len)

    merged = concat_dict((chunk[0] for chunk in data_collection))

    # convert to int where possible
    for name in list(merged):
        values = merged[name]
        try:
            values = values.astype('i4')
        except ValueError:
            pass
        merged[name] = values
    return merged
Beispiel #4
0
    def test_multiple_cores(self):
        """Run ``worker_by_state`` in a process pool and compare with serial.

        The angle and distance are computed serially, then through a
        ``multiprocessing.Pool`` with 1 and 2 workers.  Worker results are
        ordered by their first element before the second elements are
        concatenated and checked.
        """
        from multiprocessing import Pool
        traj = pt.iterload(fn('tz2.nc'), fn('tz2.parm7'))
        expected_angle = pt.angle(traj, ':3 :10 :11')
        expected_dist = pt.distance(traj, ':3 :10')

        for n_cores in (1, 2):
            commands = ['angle :3 :10 :11', 'distance :3 :10']
            run_rank = partial(worker_by_state,
                               n_cores=n_cores,
                               traj=traj,
                               dtype='dict',
                               lines=commands)
            pool = Pool(n_cores)
            results = pool.map(run_rank, list(range(n_cores)))
            pool.close()
            pool.join()

            # order by element 0 of each result so the chunks are
            # concatenated in a fixed order
            ordered = sorted(results, key=lambda item: item[0])
            merged = concat_dict(item[1] for item in ordered)
            aa_eq(merged['Ang_00002'], expected_angle)
            aa_eq(merged['Dis_00003'], expected_dist)
Beispiel #5
0
    def process(self):
        """Combine the per-chunk results in ``self.data`` into a final result.

        Every item of ``self.data`` is indexed as ``(result, n_frames)``.
        How the items are combined depends on ``self.func``:

        - ``matrix.dist``, ``matrix.idea``, ``volmap``: each result is
          multiplied by its ``n_frames``, the products are summed and the sum
          is divided by ``self.traj.n_frames``.
        - ``ired_vector_and_matrix``: results are ``(vecs, mat)``; the
          matrices are combined as above and the vectors are column-stacked.
          Returns ``(vecs, mat)``.
        - ``rotation_matrix``: matrices are stacked row-wise.  When
          ``with_rmsd`` is truthy in ``self.kwargs`` the results are
          ``(mat, rmsd)`` and ``OrderedDict(out=(mat, rmsd))`` is returned,
          otherwise ``OrderedDict(mat=mat)``.
        - ``mean_structure``: the ``xyz`` of each result is combined as for
          the matrices and copied into a new ``Frame``.
        - any function whose ``__name__`` contains ``'hbond'``: delegated to
          ``concat_hbond``.
        - everything else: ``concat_dict`` over the first element of each
          item.

        Notes
        -----
        Lists (not generators) are passed to the numpy stacking functions,
        which require a sequence.  The weighted sums use the builtin ``sum``
        so that the chunks are added element-wise; ``np.sum`` on a list of
        arrays would reduce over every axis instead.
        """
        # val : Tuple[OrderedDict, n_frames]

        if self.func in [matrix.dist, matrix.idea, volmap]:
            mat = sum(
                val[0] * val[1] for val in self.data) / self.traj.n_frames
            return mat
        elif self.func in [
                ired_vector_and_matrix,
        ]:
            # val : Tuple[(vecs, mat), n_frames]
            mat = sum(
                val[0][1] * val[1] for val in self.data) / self.traj.n_frames
            vecs = np.column_stack([val[0][0] for val in self.data])
            return (vecs, mat)
        elif self.func in [
                rotation_matrix,
        ]:
            if self.kwargs.get('with_rmsd'):
                # val : Tuple[(mat, rmsd), n_frames]
                mat = np.vstack([val[0][0] for val in self.data])
                rmsd_ = np.hstack([val[0][1] for val in self.data])
                return OrderedDict(out=(mat, rmsd_))
            else:
                # val : Tuple[mat, n_frames]
                mat = np.vstack([val[0] for val in self.data])
                return OrderedDict(mat=mat)
        elif self.func == mean_structure:
            # x : Tuple[frame-like object exposing ``xyz``, n_frames]
            xyz = sum(
                x[1] * x[0].xyz for x in self.data) / self.traj.n_frames
            frame = Frame(xyz.shape[0])
            frame.xyz[:] = xyz
            return frame
        elif 'hbond' in self.func.__name__:
            return concat_hbond(self.data)
        else:
            return concat_dict((x[0] for x in self.data))
Beispiel #6
0
# Input files, located under ``root_dir``
traj_name = root_dir + "tz2.nc"
parm_name = root_dir + "tz2.parm7"

# load to TrajectoryIterator
traj = pt.iterload(traj_name, parm_name, frame_slice=(0, 4000))

# cpptraj commands to run on every chunk
lines = ['autoimage', 'distance :3 :10', 'molsurf @CA']

# run in 'mpi' mode; results are gathered on the first process (rank=0)
n_cores = comm.size
data = _load_batch_pmap(n_cores,
                        lines=lines,
                        traj=traj,
                        mode='mpi',
                        dtype='dict')

if comm.rank != 0:
    # nothing is expected on the other processes
    assert data is None
else:
    # Element 0 of every gathered item is concatenated
    # (presumably the per-chunk dict -- confirm against _load_batch_pmap);
    # ``concat_dict`` comes from ``pytraj.tools``.
    data_0 = concat_dict(x[0] for x in data)

    # serial reference (not needed when copying this into your own script)
    state = pt.load_batch(traj, lines)
    state.run()
    data = state.data[1:].to_dict()

    for key_0, key in zip(sorted(data_0), sorted(data)):
        aa_eq(data_0[key_0], data[key])
Beispiel #7
0
def _pmap(func, traj, *args, **kwargs):
    '''use python's multiprocessing to accelerate calculation. Limited calculations.

    Parameters
    ----------
    func : a pytraj's methods or a list of string or simply as a cpptraj' text
    traj : pytraj.TrajectoryIterator
    n_cores : int, number of cores to be used, default 2. Specify n_cores=-1 to use all available cores
    iter_options : dict, default {}
        Specify trajectory iterating option. This will be done before calling ``func``.
    frame_indices : {None, array-like}, default None, optional
        if provided, pytraj will split this frame_indices into different chunks and let
        cpptraj perform calculation for specific indices.
        frame_indices must be picklable so it can be sent to different cores.
    apply, progress, progress_params : optional
        Removed from ``kwargs`` (defaults: None, None, ``dict()``) and forwarded
        unchanged to the worker when ``func`` is a callable.

    *args, **kwargs: additional keywords

    Returns
    -------
    out : OrderedDict

    Raises
    ------
    ValueError
        if ``func`` is neither a cpptraj command (str, list, tuple) nor a callable,
        if ``func`` is not marked as parallelizable, or if ``traj`` is not a
        ``TrajectoryIterator`` (callable ``func`` only).
    RuntimeError
        if ``func`` supports openmp and cpptraj was compiled with openmp.

    Notes
    -----
    - If you not sure about parallel's results, you should compare the output to serial run.

    - This is absolutely experimental. The syntax might be changed in future.

    Rule of thumbs: start with small number of frames (saying 10 frames), varying
    n_cores=1, 2, 3, 4 to see if the data makes sense or not.

    There are two modes in this method, use pytraj's methods (pytraj.rmsd, pytraj.radgyr,
    ...) or use cpptraj's command text syntax ('autoimage', 'rms', ...)

    If using cpptraj syntax::

        pytraj only supports limited cpptraj's Actions (not Analysis, check Amber15 manual
        about Action and Analysis), say no  to 'matrix', 'atomicfluct', ... or any action
        that results output depending on the number of frames.


    This method only benefits you if your calculation is quite long (saying few minutes to
    few hours). For calculation that takes less than 1 minutes, you won't see the
    significant speed up (or even slower) since pytraj need to warm up and need to gather
    data when the calculation done.

    The parallel calculation is very simple, trajectory will be split (almost equal) to
    different chunk (n_chunks = n_cores), pytraj/cpptraj perform calculation for each
    chunk in each core and then send data back to master. Note that we are using Python's
    built-in multiprocessing module, so you can use this method interactively in Ipython
    and ipython/jupyter notebook. This behavior is different from using MPI, in which you
    need to write a script, escaping ipython session and type something like::

        mpirun -n 4 python my_script.py

    vs::

        In [1]: pt.pmap(pt.radgyr, traj, n_cores=4)
        Out[1]:
        OrderedDict([('RoG_00000',
                      array([ 18.91114428,  18.93654996,  18.84969884,  18.90449256,
                              18.8568644 ,  18.88917208,  18.9430491 ,  18.88878079,
                              18.91669565,  18.87069722]))])

    This is experimental method, you should expect its syntax, default output will be changed.

    When sending Topology to different cores, pytraj will reload Topology from
    traj.top.filename, so if you need to update Topology (on the fly), save it to disk and
    reload before using ``pytraj.pmap``

    Examples
    --------
    >>> import numpy as np
    >>> import pytraj as pt
    >>> traj = pt.load_sample_data('tz2')

    >>> # use iter_options
    >>> iter_options = {'autoimage': True, 'rmsfit': (0, '@CA')}
    >>> data = pt.pmap(pt.mean_structure, traj, iter_options=iter_options)

    >>> # cpptraj command style
    >>> data = pt.pmap(['distance :3 :7', 'vector mask :3 :12'], traj, n_cores=4)

    >>> # use reference. Need to explicitly use 'refindex', which is index of reflist
    >>> data = pt.pmap(['rms @CA refindex 0'], traj, ref=[traj[3],], n_cores=3)
    >>> data
    OrderedDict([('RMSD_00001', array([  2.68820312e-01,   3.11804885e-01,   2.58835452e-01,
             9.10475988e-08,   2.93310737e-01,   4.10197322e-01,
             3.96226694e-01,   3.66059215e-01,   3.90890362e-01,
             4.89180497e-01]))])

    >>> # use reference: if not want to use 'refindex', can use 'reference'
    >>> # the advantage is you can not specify a list of reference
    >>> data = pt.pmap(['rms @CA reference'], traj, ref=[traj[3],], n_cores=3)
    >>> data
    OrderedDict([('RMSD_00001', array([  2.68820312e-01,   3.11804885e-01,   2.58835452e-01,
             9.10475988e-08,   2.93310737e-01,   4.10197322e-01,
             3.96226694e-01,   3.66059215e-01,   3.90890362e-01,
             4.89180497e-01]))])

    >>> # use different references. Need to explicitly use 'refindex', which is index of reflist
    >>> # create a list of references
    >>> reflist = traj[3], traj[4]
    >>> # make sure to specify `refindex`
    >>> # `refindex 0` is equal to `reflist[0]`
    >>> # `refindex 1` is equal to `reflist[1]`
    >>> data = pt.pmap(['rms @CA refindex 0', 'rms !@H= refindex 1'], traj, ref=reflist, n_cores=2)
    >>> # convert to ndarray
    >>> data_arr = pt.tools.dict_to_ndarray(data)

    >>> # perform parallel calculation with given frame_indices
    >>> traj = pt.datafiles.load_tz2()
    >>> data = pt.pmap(pt.radgyr, traj, '@CA', frame_indices=range(10, 50), n_cores=4)
    >>> # serial version
    >>> data = pt.radgyr(traj, '@CA', frame_indices=range(10, 50))


    See also
    --------
    pytraj.pmap_mpi
    '''
    from multiprocessing import Pool
    from pytraj import TrajectoryIterator

    # options consumed here; whatever is left in ``kwargs`` is forwarded
    n_cores = kwargs.pop('n_cores', 2)
    iter_options = kwargs.pop('iter_options', {})
    apply = kwargs.pop('apply', None)
    progress = kwargs.pop('progress', None)
    progress_params = kwargs.pop('progress_params', dict())

    if n_cores <= 0:
        # use all available cores
        n_cores = cpu_count()

    # update reference
    if 'ref' in kwargs:
        kwargs['ref'] = get_reference(traj, kwargs['ref'])

    if isinstance(func, (list, tuple, string_types)):
        # cpptraj command style: delegate to _load_batch_pmap
        from pytraj.parallel.base import _load_batch_pmap
        #check_valid_command(func)
        data = _load_batch_pmap(n_cores=n_cores,
                                traj=traj,
                                lines=func,
                                dtype='dict',
                                root=0,
                                mode='multiprocessing',
                                **kwargs)
        # element 0 of each item is the piece that gets concatenated
        return concat_dict((x[0] for x in data))

    # pytraj's method
    if not callable(func):
        raise ValueError('must callable argument')
    if not getattr(func, '_is_parallelizable', False):
        raise ValueError("this method does not support parallel")
    if getattr(func, '_openmp_capability',
               False) and 'OPENMP' in compiled_info():
        raise RuntimeError(
            "this method supports both openmp and pmap, but your cpptraj "
            "version was installed with openmp. Should not use both openmp and pmap at the "
            "same time. In this case, do not use pmap since openmp is more efficient"
        )

    if not isinstance(traj, TrajectoryIterator):
        raise ValueError('only support TrajectoryIterator')

    # these functions are combined by PmapDataset without forcing a dict dtype
    if 'dtype' not in kwargs and func not in [
            mean_structure,
            matrix.dist,
            matrix.idea,
            ired_vector_and_matrix,
            rotation_matrix,
            volmap,
    ]:
        kwargs['dtype'] = 'dict'

    # keyword
    if func is volmap:
        assert kwargs.get('size') is not None, 'must provide "size" value'

    p = Pool(n_cores)
    try:
        pfuncs = partial(worker_by_func,
                         n_cores=n_cores,
                         func=func,
                         traj=traj,
                         args=args,
                         kwargs=kwargs,
                         iter_options=iter_options,
                         apply=apply,
                         progress=progress,
                         progress_params=progress_params)

        # one task per rank
        data = p.map(pfuncs, list(range(n_cores)))
    finally:
        # always release the worker processes, even if a worker raised;
        # join() waits for them to exit so no child process is left behind
        p.close()
        p.join()

    dataset_processor = PmapDataset(data,
                                    func=func,
                                    kwargs=kwargs,
                                    traj=traj)
    return dataset_processor.process()
Beispiel #8
0
def pmap_mpi(func, traj, *args, **kwargs):
    """Run a calculation in parallel over MPI processes (mpi4py).

    The work is divided between the processes of ``MPI.COMM_WORLD`` and the
    pieces are gathered on rank 0.

    Parameters
    ----------
    func : a function, or a list/tuple of cpptraj commands
        A list or tuple is handed to ``_load_batch_pmap`` in ``'mpi'`` mode;
        anything else is called on this rank's frame iterator.
    traj : pytraj.TrajectoryIterator
    *args, **kwargs: additional arguments
        ``ref`` is converted with ``get_reference``.  When ``func`` is not a
        list/tuple, ``dtype`` defaults to ``'dict'`` and ``frame_indices``
        (if given) is split between the ranks instead of the whole frame
        range.

    Returns
    -------
    The combined result on rank 0.  Other ranks return whatever the gather
    step gave them (expected to be None -- confirm against mpi4py).

    Examples
    --------
    .. code-block:: bash

        $ # create test_radgyr.py file
        $ cat > test_radgyr.py <<EOF
        import pytraj as pt
        from mpi4py import MPI
        comm = MPI.COMM_WORLD

        traj = pt.iterload('tz2.nc', 'tz2.parm7')

        result_arr = pt.pmap_mpi(pt.radgyr, traj, "@CA")

        if comm.rank == 0:
            # save data to disk to read later by pytraj.read_pickle
            # pt.to_pickle(result_arr, 'output.pk')
            print(result_arr)
        EOF

        $ # run in parallel
        $ mpirun -n 4 python ./test_radgyr.py
        [array([ 8.10916061,  7.7643485 ,  8.09693108, ...,  9.70825678,
                9.3161563 ,  8.86720964]), array([ 8.82037273,  8.89008289,  9.48540176, ...,  9.29585981,
                9.53138062,  9.19155977]), array([ 9.13735723,  8.94651001,  8.97810478, ...,  7.68751186,
                8.31361647,  7.83763754]), array([ 7.37423766,  7.05637263,  6.52135566, ...,  6.38061648,
                6.24139008,  6.48994552])]
    """
    from mpi4py import MPI
    world = MPI.COMM_WORLD
    n_procs = world.size
    my_rank = world.rank
    is_root = (my_rank == 0)

    # update reference
    if 'ref' in kwargs:
        kwargs['ref'] = get_reference(traj, kwargs['ref'])

    if isinstance(func, (list, tuple)):
        # cpptraj command style
        from pytraj.parallel.base import _load_batch_pmap
        gathered = _load_batch_pmap(n_cores=n_procs,
                                    traj=traj,
                                    lines=func,
                                    dtype='dict',
                                    root=0,
                                    mode='mpi',
                                    **kwargs)
        if is_root:
            # on the other ranks ``gathered`` is None
            gathered = concat_dict((piece[0] for piece in gathered))
        return gathered

    # function style: every rank works on its own chunk of frames
    kwargs.setdefault('dtype', 'dict')

    frame_indices = kwargs.pop('frame_indices', None)
    if frame_indices is not None:
        chunk_indices = np.array_split(frame_indices, n_procs)[my_rank]
        chunk_iter = traj.iterframe(frame_indices=chunk_indices)
    else:
        first, last = split_range(n_procs, 0, traj.n_frames)[my_rank]
        chunk_iter = traj.iterframe(start=first, stop=last)

    chunk_n_frames = chunk_iter.n_frames
    chunk_result = func(chunk_iter, *args, **kwargs)

    # gathered : List[OrderedDict or Any]
    gathered = world.gather(chunk_result, root=0)
    gathered_n_frames = world.gather(chunk_n_frames, root=0)

    if not is_root:
        return gathered

    # pair every chunk result with the number of frames it covers
    paired = list(zip(gathered, gathered_n_frames))
    return PmapDataset(paired, func=func, traj=traj, kwargs=kwargs).process()