Beispiel #1
0
class RDF(AnalysisHook):
    def __init__(self,
                 rcut,
                 rspacing,
                 f=None,
                 start=0,
                 end=-1,
                 max_sample=None,
                 step=None,
                 select0=None,
                 select1=None,
                 pairs_sr=None,
                 nimage=0,
                 pospath='trajectory/pos',
                 poskey='pos',
                 cellpath=None,
                 cellkey=None,
                 outpath=None):
        """Computes a radial distribution function (RDF)

           **Argument:**

           rcut
                The cutoff for the RDF analysis. This should be lower than the
                spacing between the primitive cell planes, multiplied by (1+2*nimage).

           rspacing
                The width of the bins to build up the RDF.

           **Optional arguments:**

           f
                An h5.File instance containing the trajectory data. If ``f``
                is not given, or it does not contain the dataset referred to
                with the ``path`` argument, an on-line analysis is carried out.

           start, end, max_sample, step
                arguments to setup the selection of time slices. See
                ``get_slice`` for more information.

           select0
                A list of atom indexes that are considered for the computation
                of the rdf. If not given, all atoms are used.

           select1
                A list of atom indexes that are needed to compute an RDF between
                two disjoint sets of atoms. (If there is some overlap between
                select0 and select1, an error will be raised.) If this is None,
                an 'internal' RDF will be computed for the atoms specified in
                select0.

           pairs_sr
                An array with short-range pairs of atoms (shape K x 2). When
                given, an additional RDFs is generated for the short-range pairs
                (rdf_sr).

           nimage
                The number of cell images to consider in the computation of the
                pair distances. By default, this is zero, meaning that only the
                minimum image convention is used.

           pospath
                The path of the dataset that contains the time dependent data in
                the HDF5 file. The first axis of the array must be the time
                axis. This is only needed for an off-line analysis

           poskey
                In case of an on-line analysis, this is the key of the state
                item that contains the data from which the RDF is derived.

           cellpath
                The path the time-dependent cell vector data. This is only
                needed when the cell parameters are variable and the analysis is
                off-line.

           cellkey
                The key of the stateitem that contains the cell vectors. This
                is only needed when the cell parameters are variable and the
                analysis is done on-line.

           outpath
                The output path for the frequency computation in the HDF5 file.
                If not given, it defaults to '%s_rdf' % path. If this path
                already exists, it will be removed first.

           When f is None, or when the path does not exist in the HDF5 file, the
           class can be used as an on-line analysis hook for the iterative
           algorithms in yaff.sampling package. This means that the RDF
           is built up as the itertive algorithm progresses. The end option is
           ignored and max_sample is not applicable to an on-line analysis.
        """
        if select0 is not None:
            if len(select0) != len(set(select0)):
                raise ValueError('No duplicates are allowed in select0')
            if len(select0) == 0:
                raise ValueError('select0 can not be an empty list')
        if select1 is not None:
            if len(select1) != len(set(select1)):
                raise ValueError('No duplicates are allowed in select1')
            if len(select1) == 0:
                raise ValueError('select1 can not be an empty list')
        if select0 is not None and select1 is not None and len(select0) + len(
                select1) != len(set(select0) | set(select1)):
            raise ValueError(
                'No overlap is allowed between select0 and select1. If you want to compute and RDF within a set of atoms, omit the select1 argument.'
            )
        if select0 is None and select1 is not None:
            raise ValueError('select1 can not be given without select0.')
        self.rcut = rcut
        self.rspacing = rspacing
        self.select0 = select0
        self.select1 = select1
        self.pairs_sr = self._process_pairs_sr(pairs_sr)
        self.nimage = nimage
        self.nbin = int(self.rcut / self.rspacing)
        self.bins = np.arange(self.nbin + 1) * self.rspacing
        self.d = self.bins[:-1] + 0.5 * self.rspacing
        self.rdf_sum = np.zeros(self.nbin, float)
        self.CN_sum = np.zeros(self.nbin, float)
        if self.pairs_sr is not None:
            self.rdf_sum_sr = np.zeros(self.nbin, float)
        self.nsample = 0
        if outpath is None:
            outpath = pospath + '_rdf'
        analysis_inputs = {
            'pos': AnalysisInput(pospath, poskey),
            'cell': AnalysisInput(cellpath, cellkey, False)
        }
        AnalysisHook.__init__(self, f, start, end, max_sample, step,
                              analysis_inputs, outpath, False)

    def _process_pairs_sr(self, pairs_sr):
        '''Process the short-range pairs

           The following modifications are made to the list of short-range
           pairs:

           - The pairs that do not fit in select0 (and select1) are left out.
           - The list is properly sorted.

           Note that the argument pairs_sr provided to the constructor is not
           modified in-place. It is therefore safe to reuse it for another RDF
           analysis.
        '''
        if pairs_sr is None:
            return None
        elif self.select1 is None:
            index0 = dict((atom0, i0) for i0, atom0 in enumerate(self.select0))
            index1 = index0
        else:
            index0 = dict((atom0, i0) for i0, atom0 in enumerate(self.select0))
            index1 = dict((atom1, i1) for i1, atom1 in enumerate(self.select1))
        my_pairs_sr = []
        for atom0, atom1 in pairs_sr:
            i0 = index0.get(atom0)
            i1 = index1.get(atom1)
            if i0 is None or i1 is None:
                i0 = index0.get(atom1)
                i1 = index1.get(atom0)
            if i0 is None or i1 is None:
                continue
            if self.select1 is None and i0 < i1:
                i0, i1 = i1, i0
            my_pairs_sr.append((i0, i1))
        if len(my_pairs_sr) > 0:
            my_pairs_sr.sort()
            return np.array(my_pairs_sr)

    def _update_rvecs(self, rvecs):
        self.cell = Cell(rvecs)
        if self.cell.nvec != 3:
            raise ValueError(
                'RDF can only be computed for 3D periodic systems.')
        if (2 * self.rcut > self.cell.rspacings * (1 + 2 * self.nimage)).any():
            raise ValueError(
                'The 2*rcut argument should not exceed any of the cell spacings.'
            )

    def configure_online(self, iterative, st_pos, st_cell=None):
        self.natom = iterative.ff.system.natom
        self._update_rvecs(iterative.ff.system.cell.rvecs)

    def configure_offline(self, ds_pos, ds_cell=None):
        if ds_cell is None:
            # In this case, we have a unit cell that does not change shape.
            # It must be configured just once.
            if 'rvecs' in self.f['system']:
                self._update_rvecs(self.f['system/rvecs'][:])
            else:
                self._update_rvecs(None)
        # get the total number of atoms
        self.natom = self.f['system/numbers'].shape[0]

    def init_first(self):
        '''Setup some work arrays'''
        # determine the number of atoms
        if self.select0 is None:
            self.natom0 = self.natom
        else:
            self.natom0 = len(self.select0)
        self.pos0 = np.zeros((self.natom0, 3), float)
        # the number of pairs
        if self.select1 is None:
            self.npair = (self.natom0 * (self.natom0 - 1)) // 2
            self.pos1 = None
        else:
            self.natom1 = len(self.select1)
            self.pos1 = np.zeros((self.natom1, 3), float)
            self.npair = self.natom0 * self.natom1
        # multiply the number of pairs by all images
        self.npair *= (1 + 2 * self.nimage)**3
        # Prepare the output
        self.work = np.zeros(self.npair, float)
        AnalysisHook.init_first(self)
        if self.outg is not None:
            self.outg.create_dataset('rdf', (self.nbin, ), float)
            self.outg.create_dataset('CN', (self.nbin, ), float)
            self.outg['d'] = self.d
            if self.pairs_sr is not None:
                self.outg.create_dataset('rdf_sr', (self.nbin, ), float)

    def read_online(self, st_pos, st_cell=None):
        if st_cell is not None:
            self._update_rvecs(st_cell.value)
        if self.select0 is None:
            self.pos0[:] = st_pos.value
        else:
            self.pos0[:] = st_pos.value[self.select0]
        if self.select1 is not None:
            self.pos1[:] = st_pos.value[self.select1]

    def read_offline(self, i, ds_pos, ds_cell=None):
        if ds_cell is not None:
            self._update_rvecs(np.array(ds_cell[i]))
        if self.select0 is None:
            ds_pos.read_direct(self.pos0, (i, ))
        else:
            ds_pos.read_direct(self.pos0, (i, self.select0))
        if self.select1 is not None:
            ds_pos.read_direct(self.pos1, (i, self.select1))

    def compute_iteration(self):
        self.cell.compute_distances(self.work,
                                    self.pos0,
                                    self.pos1,
                                    nimage=self.nimage)
        counts = np.histogram(self.work, bins=self.bins)[0]
        normalization = self.npair / (
            self.cell.volume *
            (1 + 2 * self.nimage)**3) * (4 * np.pi * self.rspacing) * self.d**2
        self.rdf_sum += counts / normalization
        self.CN_sum += counts / (self.natom0 * 4 * np.pi * self.rspacing *
                                 self.d**2)
        if self.pairs_sr is not None:
            self.cell.compute_distances(self.work[:len(self.pairs_sr)],
                                        self.pos0,
                                        self.pos1,
                                        pairs=self.pairs_sr,
                                        do_include=True)
            counts_sr = np.histogram(self.work[:len(self.pairs_sr)],
                                     bins=self.bins)[0]
            self.rdf_sum_sr += counts_sr / normalization
        self.nsample += 1

    def compute_derived(self):
        # derive the RDF and the CN
        from scipy.integrate import cumtrapz
        self.rdf = self.rdf_sum / self.nsample
        if self.select1 is None:
            self.CN_sum *= (1 +
                            2 * self.nimage)**3 * self.natom0**2 / self.npair
        self.CN = cumtrapz(4 * np.pi * self.d**2 * self.CN_sum / self.nsample,
                           self.d,
                           initial=0.)
        if self.pairs_sr is not None:
            self.rdf_sr = self.rdf_sum_sr / self.nsample
        # store everything in the h5py file
        if self.outg is not None:
            self.outg['rdf'][:] = self.rdf
            self.outg['CN'][:] = self.CN
            if self.pairs_sr is not None:
                self.outg['rdf_sr'][:] = self.rdf_sr

    def plot(self, fn_png='rdf.png'):
        import matplotlib.pyplot as pt
        pt.clf()
        xunit = log.length.conversion
        pt.plot(self.d / xunit, self.rdf, 'k-', drawstyle='steps-mid')
        if self.pairs_sr is not None:
            pt.plot(self.d / xunit, self.rdf_sr, 'r-', drawstyle='steps-mid')
        pt.xlabel('Distance [%s]' % log.length.notation)
        pt.ylabel('RDF')
        pt.xlim(self.bins[0] / xunit, self.bins[-1] / xunit)
        pt.savefig(fn_png)
Beispiel #2
0
class RDF(AnalysisHook):
    def __init__(self, rcut, rspacing, f=None, start=0, end=-1, max_sample=None,
                 step=None, select0=None, select1=None, pairs_sr=None, nimage=0,
                 pospath='trajectory/pos', poskey='pos', cellpath=None,
                 cellkey=None, outpath=None):
        """Computes a radial distribution function (RDF)

           **Argument:**

           rcut
                The cutoff for the RDF analysis. This should be lower than the
                spacing between the primitive cell planes, multiplied by (1+2*nimage).

           rspacing
                The width of the bins to build up the RDF.

           **Optional arguments:**

           f
                An h5.File instance containing the trajectory data. If ``f``
                is not given, or it does not contain the dataset referred to
                with the ``path`` argument, an on-line analysis is carried out.

           start, end, max_sample, step
                arguments to setup the selection of time slices. See
                ``get_slice`` for more information.

           select0
                A list of atom indexes that are considered for the computation
                of the rdf. If not given, all atoms are used.

           select1
                A list of atom indexes that are needed to compute an RDF between
                two disjoint sets of atoms. (If there is some overlap between
                select0 and select1, an error will be raised.) If this is None,
                an 'internal' RDF will be computed for the atoms specified in
                select0.

           pairs_sr
                An array with short-range pairs of atoms (shape K x 2). When
                given, an additional RDFs is generated for the short-range pairs
                (rdf_sr).

           nimage
                The number of cell images to consider in the computation of the
                pair distances. By default, this is zero, meaning that only the
                minimum image convention is used.

           pospath
                The path of the dataset that contains the time dependent data in
                the HDF5 file. The first axis of the array must be the time
                axis. This is only needed for an off-line analysis

           poskey
                In case of an on-line analysis, this is the key of the state
                item that contains the data from which the RDF is derived.

           cellpath
                The path the time-dependent cell vector data. This is only
                needed when the cell parameters are variable and the analysis is
                off-line.

           cellkey
                The key of the stateitem that contains the cell vectors. This
                is only needed when the cell parameters are variable and the
                analysis is done on-line.

           outpath
                The output path for the frequency computation in the HDF5 file.
                If not given, it defaults to '%s_rdf' % path. If this path
                already exists, it will be removed first.

           When f is None, or when the path does not exist in the HDF5 file, the
           class can be used as an on-line analysis hook for the iterative
           algorithms in yaff.sampling package. This means that the RDF
           is built up as the itertive algorithm progresses. The end option is
           ignored and max_sample is not applicable to an on-line analysis.
        """
        if select0 is not None:
            if len(select0) != len(set(select0)):
                raise ValueError('No duplicates are allowed in select0')
            if len(select0) == 0:
                raise ValueError('select0 can not be an empty list')
        if select1 is not None:
            if len(select1) != len(set(select1)):
                raise ValueError('No duplicates are allowed in select1')
            if len(select1) == 0:
                raise ValueError('select1 can not be an empty list')
        if select0 is not None and select1 is not None and len(select0) + len(select1) != len(set(select0) | set(select1)):
            raise ValueError('No overlap is allowed between select0 and select1. If you want to compute and RDF within a set of atoms, omit the select1 argument.')
        if select0 is None and select1 is not None:
            raise ValueError('select1 can not be given without select0.')
        self.rcut = rcut
        self.rspacing = rspacing
        self.select0 = select0
        self.select1 = select1
        self.pairs_sr = self._process_pairs_sr(pairs_sr)
        self.nimage = nimage
        self.nbin = int(self.rcut/self.rspacing)
        self.bins = np.arange(self.nbin+1)*self.rspacing
        self.d = self.bins[:-1] + 0.5*self.rspacing
        self.rdf_sum = np.zeros(self.nbin, float)
        if self.pairs_sr is not None:
            self.rdf_sum_sr = np.zeros(self.nbin, float)
        self.nsample = 0
        if outpath is None:
            outpath = pospath + '_rdf'
        analysis_inputs = {'pos': AnalysisInput(pospath, poskey), 'cell': AnalysisInput(cellpath, cellkey, False)}
        AnalysisHook.__init__(self, f, start, end, max_sample, step, analysis_inputs, outpath, False)

    def _process_pairs_sr(self, pairs_sr):
        '''Process the short-range pairs

           The following modifications are made to the list of short-range
           pairs:

           - The pairs that do not fit in select0 (and select1) are left out.
           - The list is properly sorted.

           Note that the argument pairs_sr provided to the constructor is not
           modified in-place. It is therefore safe to reuse it for another RDF
           analysis.
        '''
        if pairs_sr is None:
            return None
        elif self.select1 is None:
            index0 = dict((atom0, i0) for i0, atom0 in enumerate(self.select0))
            index1 = index0
        else:
            index0 = dict((atom0, i0) for i0, atom0 in enumerate(self.select0))
            index1 = dict((atom1, i1) for i1, atom1 in enumerate(self.select1))
        my_pairs_sr = []
        for atom0, atom1 in pairs_sr:
            i0 = index0.get(atom0)
            i1 = index1.get(atom1)
            if i0 is None or i1 is None:
                i0 = index0.get(atom1)
                i1 = index1.get(atom0)
            if i0 is None or i1 is None:
                continue
            if self.select1 is None and i0 < i1:
                i0, i1 = i1, i0
            my_pairs_sr.append((i0, i1))
        if len(my_pairs_sr) > 0:
            my_pairs_sr.sort()
            return np.array(my_pairs_sr)

    def _update_rvecs(self, rvecs):
        self.cell = Cell(rvecs)
        if self.cell.nvec != 3:
            raise ValueError('RDF can only be computed for 3D periodic systems.')
        if (2*self.rcut > self.cell.rspacings*(1+2*self.nimage)).any():
            raise ValueError('The 2*rcut argument should not exceed any of the cell spacings.')

    def configure_online(self, iterative, st_pos, st_cell=None):
        self.natom = iterative.ff.system.natom
        self._update_rvecs(iterative.ff.system.cell.rvecs)

    def configure_offline(self, ds_pos, ds_cell=None):
        if ds_cell is None:
            # In this case, we have a unit cell that does not change shape.
            # It must be configured just once.
            if 'rvecs' in self.f['system']:
                self._update_rvecs(self.f['system/rvecs'][:])
            else:
                self._update_rvecs(None)
        # get the total number of atoms
        self.natom = self.f['system/numbers'].shape[0]

    def init_first(self):
        '''Setup some work arrays'''
        # determine the number of atoms
        if self.select0 is None:
            self.natom0 = self.natom
        else:
            self.natom0 = len(self.select0)
        self.pos0 = np.zeros((self.natom0, 3), float)
        # the number of pairs
        if self.select1 is None:
            self.npair = (self.natom0*(self.natom0-1))/2
            self.pos1 = None
        else:
            self.natom1 = len(self.select1)
            self.pos1 = np.zeros((self.natom1, 3), float)
            self.npair = self.natom0*self.natom1
        # multiply the number of pairs by all images
        self.npair *= (1 + 2*self.nimage)**3
        # Prepare the output
        self.work = np.zeros(self.npair, float)
        AnalysisHook.init_first(self)
        if self.outg is not None:
            self.outg.create_dataset('rdf', (self.nbin,), float)
            self.outg['d'] = self.d
            if self.pairs_sr is not None:
                self.outg.create_dataset('rdf_sr', (self.nbin,), float)

    def read_online(self, st_pos, st_cell=None):
        if st_cell is not None:
            self._update_rvecs(st_cell.value)
        if self.select0 is None:
            self.pos0[:] = st_pos.value
        else:
            self.pos0[:] = st_pos.value[self.select0]
        if self.select1 is not None:
            self.pos1[:] = st_pos.value[self.select1]

    def read_offline(self, i, ds_pos, ds_cell=None):
        if ds_cell is not None:
            self._update_rvecs(np.array(ds_cell[i]))
        if self.select0 is None:
            ds_pos.read_direct(self.pos0, (i,))
        else:
            ds_pos.read_direct(self.pos0, (i,self.select0))
        if self.select1 is not None:
            ds_pos.read_direct(self.pos1, (i,self.select1))

    def compute_iteration(self):
        self.cell.compute_distances(self.work, self.pos0, self.pos1, nimage=self.nimage)
        counts = np.histogram(self.work, bins=self.bins)[0]
        normalization = (self.npair/(self.cell.volume*(1+2*self.nimage)**3)*(4*np.pi*self.rspacing))*self.d**2
        self.rdf_sum += counts/normalization
        if self.pairs_sr is not None:
            self.cell.compute_distances(self.work[:len(self.pairs_sr)], self.pos0, self.pos1, pairs=self.pairs_sr, do_include=True)
            counts_sr = np.histogram(self.work[:len(self.pairs_sr)], bins=self.bins)[0]
            self.rdf_sum_sr += counts_sr/normalization
        self.nsample += 1

    def compute_derived(self):
        # derive the RDF
        self.rdf = self.rdf_sum/self.nsample
        if self.pairs_sr is not None:
            self.rdf_sr = self.rdf_sum_sr/self.nsample
        # store everything in the h5py file
        if self.outg is not None:
            self.outg['rdf'][:] = self.rdf
            if self.pairs_sr is not None:
                self.outg['rdf_sr'][:] = self.rdf_sr

    def plot(self, fn_png='rdf.png'):
        import matplotlib.pyplot as pt
        pt.clf()
        xunit = log.length.conversion
        pt.plot(self.d/xunit, self.rdf, 'k-', drawstyle='steps-mid')
        if self.pairs_sr is not None:
            pt.plot(self.d/xunit, self.rdf_sr, 'r-', drawstyle='steps-mid')
        pt.xlabel('Distance [%s]' % log.length.notation)
        pt.ylabel('RDF')
        pt.xlim(self.bins[0]/xunit, self.bins[-1]/xunit)
        pt.savefig(fn_png)
Beispiel #3
0
class System(object):
    def __init__(self, numbers, pos, scopes=None, scope_ids=None, ffatypes=None,
                 ffatype_ids=None, bonds=None, rvecs=None, charges=None, radii=None,
                 dipoles=None, radii2=None, masses=None):
        '''
           **Arguments:**

           numbers
                A numpy array with atomic numbers

           pos
                A numpy array (N,3) with atomic coordinates in Bohr.

           **Optional arguments:**

           scopes
                A list with scope names

           scope_ids
                A list of scope indexes that links each atom with an element of
                the scopes list. If this argument is not present, while scopes
                is given, it is assumed that scopes contains a scope name for
                every atom, i.e. that it is a list with length natom. In that
                case, it will be converted automatically to a scopes list
                with only unique name together with a corresponding scope_ids
                array.

           ffatypes
                A list of labels of the force field atom types.

           ffatype_ids
                A list of atom type indexes that links each atom with an element
                of the list ffatypes. If this argument is not present, while
                ffatypes is given, it is assumed that ffatypes contains an
                atom type for every element, i.e. that it is a list with length
                natom. In that case, it will be converted automatically to
                a short ffatypes list with only unique elements (within each
                scope) together with a corresponding ffatype_ids array.

           bonds
                a numpy array (B,2) with atom indexes (counting starts from
                zero) to define the chemical bonds.

           rvecs
                An array whose rows are the unit cell vectors. At most three
                rows are allowed, each containing three Cartesian coordinates.

           charges
                An array of atomic charges

           radii
                An array of atomic radii that determine shape of charge
                distribution
                rho[i]=charges[i]/(sqrt(pi)radii[i]**3)*exp(-(|r-pos[i]|/radii[i])**2)

           dipoles
                An array of atomic dipoles

           radii2
                An array of atomic radii that determine shape of dipole
                distribution
                rho[i]=-(dipoles[i] dot r-pos[i])*2.0/(sqrt(pi)radii2[i]**5)*exp(-(|r-pos[i]|/radii[i])**2)

           masses
                The atomic masses (in atomic units, i.e. m_e)


           Several attributes are derived from the (optional) arguments:

           * ``cell`` contains the rvecs attribute and is an instance of the
             ``Cell`` class.

           * ``neighs1``, ``neighs2`` and ``neighs3`` are dictionaries derived
             from ``bonds`` that contain atoms that are separated 1, 2 and 3
             bonds from a given atom, respectively. This means that i in
             system.neighs3[j] is ``True`` if there are three bonds between
             atoms i and j.
        '''
        if len(numbers.shape) != 1:
            raise ValueError('Argument numbers must be a one-dimensional array.')
        if pos.shape != (len(numbers), 3):
            raise ValueError('The pos array must have Nx3 rows. Mismatch with numbers argument with shape (N,).')
        self.numbers = numbers
        self.pos = pos
        self.ffatypes = ffatypes
        self.ffatype_ids = ffatype_ids
        self.scopes = scopes
        self.scope_ids = scope_ids
        self.bonds = bonds
        self.cell = Cell(rvecs)
        self.charges = charges
        self.radii = radii
        self.dipoles = dipoles
        self.radii2 = radii2
        self.masses = masses
        with log.section('SYS'):
            # report some stuff
            self._init_log()
            # compute some derived attributes
            self._init_derived()

    def _init_log(self):
        if log.do_medium:
            log('Unit cell')
            log.hline()
            log('Number of periodic dimensions: %i' % self.cell.nvec)
            lengths, angles = self.cell.parameters
            names = 'abc'
            for i in xrange(len(lengths)):
                log('Cell parameter %5s: %10s' % (names[i], log.length(lengths[i])))
            names = 'alpha', 'beta', 'gamma'
            for i in xrange(len(angles)):
                log('Cell parameter %5s: %10s' % (names[i], log.angle(angles[i])))
            log.hline()
            log.blank()

    def _init_derived(self):
        if self.bonds is not None:
            self._init_derived_bonds()
        if self.scopes is not None:
            self._init_derived_scopes()
        elif self.scope_ids is not None:
            raise ValueError('The scope_ids only make sense when the scopes argument is given.')
        if self.ffatypes is not None:
            self._init_derived_ffatypes()
        elif self.ffatype_ids is not None:
            raise ValueError('The ffatype_ids only make sense when the ffatypes argument is given.')

    def _init_derived_bonds(self):
        # 1-bond neighbors
        self.neighs1 = dict((i,set([])) for i in xrange(self.natom))
        for i0, i1 in self.bonds:
            self.neighs1[i0].add(i1)
            self.neighs1[i1].add(i0)
        # 2-bond neighbors
        self.neighs2 = dict((i,set([])) for i in xrange(self.natom))
        for i0, n0 in self.neighs1.iteritems():
            for i1 in n0:
                for i2 in self.neighs1[i1]:
                    # Require that there are no shorter paths than two bonds between
                    # i0 and i2. Also avoid duplicates.
                    if i2 > i0 and i2 not in self.neighs1[i0]:
                        self.neighs2[i0].add(i2)
                        self.neighs2[i2].add(i0)
        # 3-bond neighbors
        self.neighs3 = dict((i,set([])) for i in xrange(self.natom))
        for i0, n0 in self.neighs1.iteritems():
            for i1 in n0:
                for i3 in self.neighs2[i1]:
                    # Require that there are no shorter paths than three bonds
                    # between i0 and i3. Also avoid duplicates.
                    if i3 != i0 and i3 not in self.neighs1[i0] and i3 not in self.neighs2[i0]:
                        self.neighs3[i0].add(i3)
                        self.neighs3[i3].add(i0)
        # report some basic stuff on screen
        if log.do_medium:
            log('Analysis of the bonds:')
            bond_types = {}
            for i0, i1 in self.bonds:
                key = tuple(sorted([self.numbers[i0], self.numbers[i1]]))
                bond_types[key] = bond_types.get(key, 0) + 1
            log.hline()
            log(' First   Second   Count')
            for (num0, num1), count in sorted(bond_types.iteritems()):
                log('%6i   %6i   %5i' % (num0, num1, count))
            log.hline()
            log.blank()

            log('Analysis of the neighbors:')
            log.hline()
            log('Number of first neighbors:  %6i' % (sum(len(n) for n in self.neighs1.itervalues())/2))
            log('Number of second neighbors: %6i' % (sum(len(n) for n in self.neighs2.itervalues())/2))
            log('Number of third neighbors:  %6i' % (sum(len(n) for n in self.neighs3.itervalues())/2))
            # Collect all types of 'environments' for each element. This is
            # useful to double check the bonds
            envs = {}
            for i0 in xrange(self.natom):
                num0 = self.numbers[i0]
                nnums = tuple(sorted(self.numbers[i1] for i1 in self.neighs1[i0]))
                key = (num0, nnums)
                envs[key] = envs.get(key, 0)+1
            # Print the environments on screen
            log.hline()
            log('Element   Neighboring elements   Count')
            for (num0, nnums), count in sorted(envs.iteritems()):
                log('%7i   %20s   %5i' % (num0, ','.join(str(num1) for num1 in nnums), count))
            log.hline()
            log.blank()


    def _init_derived_scopes(self):
        if self.scope_ids is None:
            if len(self.scopes) != self.natom:
                raise TypeError('When the scope_ids are derived automatically, the length of the scopes list must match the number of atoms.')
            lookup = {}
            scopes = []
            self.scope_ids = np.zeros(self.natom, int)
            for i in xrange(self.natom):
                scope = self.scopes[i]
                scope_id = lookup.get(scope)
                if scope_id is None:
                    scope_id = len(scopes)
                    scopes.append(scope)
                    lookup[scope] = scope_id
                self.scope_ids[i] = scope_id
            self.scopes = scopes
        for scope in self.scopes:
            check_name(scope)
        # check the range of the ids
        if self.scope_ids.min() != 0 or self.scope_ids.max() != len(self.scopes)-1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        if log.do_medium:
            log('The following scopes are present in the system:')
            log.hline()
            log('                 Scope   ID   Number of atoms')
            log.hline()
            for scope_id, scope in enumerate(self.scopes):
                log('%22s  %3i       %3i' % (scope, scope_id, (self.scope_ids==scope_id).sum()))
            log.hline()
            log.blank()

    def _init_derived_ffatypes(self):
        if self.ffatype_ids is None:
            if len(self.ffatypes) != self.natom:
                raise TypeError('When the ffatype_ids are derived automatically, the length of the ffatypes list must match the number of atoms.')
            lookup = {}
            ffatypes = []
            self.ffatype_ids = np.zeros(self.natom, int)
            for i in xrange(self.natom):
                if self.scope_ids is None:
                    ffatype = self.ffatypes[i]
                    key = ffatype, None
                else:
                    scope_id = self.scope_ids[i]
                    ffatype = self.ffatypes[i]
                    key = ffatype, scope_id
                ffatype_id = lookup.get(key)
                if ffatype_id is None:
                    ffatype_id = len(ffatypes)
                    ffatypes.append(ffatype)
                    lookup[key] = ffatype_id
                self.ffatype_ids[i] = ffatype_id
            self.ffatypes = ffatypes
        for ffatype in self.ffatypes:
            check_name(ffatype)
        # check the range of the ids
        if self.ffatype_ids.min() != 0 or self.ffatype_ids.max() != len(self.ffatypes)-1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        # differentiate ffatype_ids if the same ffatype_id is used in different
        # scopes
        if self.scopes is not None:
            self.ffatype_id_to_scope_id = {}
            fixed_fids = {}
            for i in xrange(self.natom):
                fid = self.ffatype_ids[i]
                sid = self.ffatype_id_to_scope_id.get(fid)
                if sid is None:
                    self.ffatype_id_to_scope_id[fid] = self.scope_ids[i]
                elif sid != self.scope_ids[i]:
                    # We found the same ffatype_id in a different scope_id. This
                    # must be fixed. First check if we have already a new
                    # scope_id ready
                    sid = self.scope_ids[i]
                    new_fid = fixed_fids.get((sid, fid))
                    if new_fid is None:
                        # No previous new fid create, do it now.
                        new_fid = len(self.ffatypes)
                        # Copy the ffatype label
                        self.ffatypes.append(self.ffatypes[fid])
                        # Keep track of the new fid
                        fixed_fids[(sid, fid)] = new_fid
                        if log.do_warning:
                            log.warn('Atoms with type ID %i in scope %s were changed to type ID %i.' % (fid, self.scopes[sid], new_fid))
                    # Apply the new fid
                    self.ffatype_ids[i] = new_fid
                    self.ffatype_id_to_scope_id[new_fid] = sid
        # Turn the ffatypes in the scopes into array
        if self.ffatypes is not None:
            self.ffatypes = np.array(self.ffatypes, copy=False)
        if self.scopes is not None:
            self.scopes = np.array(self.scopes, copy=False)
        # check the range of the ids
        if self.ffatype_ids.min() != 0 or self.ffatype_ids.max() != len(self.ffatypes)-1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        if log.do_medium:
            log('The following atom types are present in the system:')
            log.hline()
            if self.scopes is None:
                log('             Atom type   ID   Number of atoms')
                log.hline()
                for ffatype_id, ffatype in enumerate(self.ffatypes):
                    log('%22s  %3i       %3i' % (ffatype, ffatype_id, (self.ffatype_ids==ffatype_id).sum()))
            else:
                log('                 Scope              Atom type   ID   Number of atoms')
                log.hline()
                for ffatype_id, ffatype in enumerate(self.ffatypes):
                    scope = self.scopes[self.ffatype_id_to_scope_id[ffatype_id]]
                    log('%22s %22s  %3i       %3i' % (scope, ffatype, ffatype_id, (self.ffatype_ids==ffatype_id).sum()))
            log.hline()
            log.blank()

    def _get_natom(self):
        """The number of atoms"""
        return len(self.pos)

    natom = property(_get_natom)

    def _get_nffatype(self):
        """The number of atom types"""
        return len(self.ffatypes)

    nffatype = property(_get_nffatype)

    def _get_nbond(self):
        '''The number of bonds'''
        if self.bonds is None:
            return 0
        else:
            return len(self.bonds)

    nbond = property(_get_nbond)

    @classmethod
    def from_file(cls, *fns, **user_kwargs):
        """Construct a new System instance from one or more files

           **Arguments:**

           fn1, fn2, ...
                A list of filenames that are read in order. Information in later
                files overrides information in earlier files.

           **Optional arguments:**

           Any argument from the default constructor ``__init__``. These must be
           given with keywords.

           **Supported file formats**

           .xyz
                Standard Cartesian coordinates file (in angstroms). Atomic
                positions and atomic numbers are read from this file. If the
                title consists of 3, 6 or 9 numbers, each group of three numbers
                is interpreted as a cell vector (in angstroms). A guess of the
                bonds will be made based on inter-atomic distances.

           .psf
                Atom types and bonds are read from this file

           .chk
                Internal text-based checkpoint format. It just contains a
                dictionary with the constructor arguments.
        """
        with log.section('SYS'):
            kwargs = {}
            for fn in fns:
                if fn.endswith('.xyz'):
                    from molmod import Molecule
                    mol = Molecule.from_file(fn)
                    kwargs['numbers'] = mol.numbers.copy()
                    kwargs['pos'] = mol.coordinates.copy()
                elif fn.endswith('.psf'):
                    from molmod.io import PSFFile
                    psf = PSFFile(fn)
                    kwargs['ffatypes'] = psf.atom_types
                    kwargs['bonds'] = np.array(psf.bonds, copy=False)
                    kwargs['charges'] = np.array(psf.charges, copy=False)
                elif fn.endswith('.chk'):
                    from molmod.io import load_chk
                    allowed_keys = [
                        'numbers', 'pos', 'scopes', 'scope_ids', 'ffatypes',
                        'ffatype_ids', 'bonds', 'rvecs', 'charges', 'radii',
                        'dipoles','radii2','masses',
                    ]
                    for key, value in load_chk(fn).iteritems():
                        if key in allowed_keys:
                            kwargs.update({key: value})
                elif fn.endswith('.h5'):
                    with h5.File(fn, 'r') as f:
                        return cls.from_hdf5(f)
                else:
                    raise IOError('Can not read from file \'%s\'.' % fn)
                if log.do_high:
                    log('Read system parameters from %s.' % fn)
            kwargs.update(user_kwargs)
        return cls(**kwargs)

    @classmethod
    def from_hdf5(cls, f):
        '''Create a system from an HDF5 file/group containing a system group

           **Arguments:**

           f
                An open h5.File object with a system group. The system group
                must at least contain a numbers and pos dataset.
        '''
        sgrp = f['system']
        kwargs = {
            'numbers': sgrp['numbers'][:],
            'pos': sgrp['pos'][:],
        }
        for key in 'scopes', 'scope_ids', 'ffatypes', 'ffatype_ids', 'bonds', 'rvecs', 'charges', 'masses':
            if key in sgrp:
                kwargs[key] = sgrp[key][:]
        if log.do_high:
            log('Read system parameters from %s.' % f.filename)
        return cls(**kwargs)

    def to_file(self, fn):
        """Write the system to a file

           **Arguments:**

           fn
                The file to write to.

           Supported formats are:

           chk
                Internal human-readable checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           h5
                Internal binary checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           xyz
                A simple file with atomic positions and elements. Coordinates
                are written in Angstroms.
        """
        if fn.endswith('.chk'):
            from molmod.io import dump_chk
            dump_chk(fn, {
                'numbers': self.numbers,
                'pos': self.pos,
                'ffatypes': self.ffatypes,
                'ffatype_ids': self.ffatype_ids,
                'scopes': self.scopes,
                'scope_ids': self.scope_ids,
                'bonds': self.bonds,
                'rvecs': self.cell.rvecs,
                'charges': self.charges,
                'masses': self.masses,
            })
        elif fn.endswith('.h5'):
            with h5.File(fn, 'w') as f:
                self.to_hdf5(f)
        elif fn.endswith('.xyz'):
            from molmod.io import XYZWriter
            from molmod.periodic import periodic
            xyz_writer = XYZWriter(fn, [periodic[n].symbol for n in self.numbers])
            xyz_writer.dump(str(self), self.pos)
        else:
            raise NotImplementedError('The extension of %s does not correspond to any known format.' % fn)
        if log.do_high:
            with log.section('SYS'):
                log('Wrote system to %s.' % fn)

    def to_hdf5(self, f):
        """Write the system to a HDF5 file.

           **Arguments:**

           f
                A Writable h5.File object.
        """
        if 'system' in f:
            raise ValueError('The HDF5 file already contains a system description.')
        sgrp = f.create_group('system')
        sgrp.create_dataset('numbers', data=self.numbers)
        sgrp.create_dataset('pos', data=self.pos)
        if self.scopes is not None:
            sgrp.create_dataset('scopes', data=self.scopes, dtype='a22')
            sgrp.create_dataset('scope_ids', data=self.scope_ids)
        if self.ffatypes is not None:
            sgrp.create_dataset('ffatypes', data=self.ffatypes, dtype='a22')
            sgrp.create_dataset('ffatype_ids', data=self.ffatype_ids)
        if self.bonds is not None:
            sgrp.create_dataset('bonds', data=self.bonds)
        if self.cell.nvec > 0:
            sgrp.create_dataset('rvecs', data=self.cell.rvecs)
        if self.charges is not None:
            sgrp.create_dataset('charges', data=self.charges)
        if self.masses is not None:
            sgrp.create_dataset('masses', data=self.masses)


    def get_scope(self, index):
        """Return the of the scope (string) of atom with given index"""
        return self.scopes[self.scope_ids[index]]

    def get_ffatype(self, index):
        """Return the of the ffatype (string) of atom with given index"""
        return self.ffatypes[self.ffatype_ids[index]]

    def get_indexes(self, rule):
        """Return the atom indexes that match the filter ``rule``

           ``rule`` can be a function that accepts two arguments: system and an
           atom index and that returns True of the atom with index i is of a
           given type. On the other hand ``rule`` can be an ATSELECT string that
           defines the atoms of interest.

           A list of atom indexes is returned.
        """
        if isinstance(rule, basestring):
            rule = atsel_compile(rule)
        return np.array([i for i in xrange(self.natom) if rule(self, i)])

    def iter_bonds(self):
        """Iterate over all bonds."""
        if self.bonds is not None:
            for i1, i2 in self.bonds:
                yield i1, i2

    def iter_angles(self):
        """Iterative over all possible valence angles.

           This routine is based on the attribute ``bonds``.
        """
        if self.bonds is not None:
            for i1 in xrange(self.natom):
                for i0 in self.neighs1[i1]:
                    for i2 in self.neighs1[i1]:
                        if i0 > i2:
                            yield i0, i1, i2

    def iter_dihedrals(self):
        """Iterative over all possible dihedral angles.

           This routine is based on the attribute ``bonds``.
        """
        if self.bonds is not None:
            for i1, i2 in self.bonds:
                for i0 in self.neighs1[i1]:
                    if i0==i2: continue
                    for i3 in self.neighs1[i2]:
                        if i1==i3: continue
                        if i0==i3: continue
                        yield i0, i1, i2, i3

    def detect_bonds(self, exceptions=None):
        """Initialize the ``bonds`` attribute based on inter-atomic distances

           **Optional argument:**

           exceptions:
                Specify custom threshold for certain pairs of elements. This
                must be a dictionary with ((num0, num1), threshold) as items.

           For each pair of elements, a distance threshold is used to detect
           bonded atoms. The distance threshold is based on a database of known
           bond lengths. If the database does not contain a record for the given
           element pair, the threshold is based on the sum of covalent radii.
        """
        with log.section('SYS'):
            from molmod.bonds import bonds
            if self.bonds is not None:
                if log.do_warning:
                    log.warn('Overwriting existing bonds.')
            work = np.zeros((self.natom*(self.natom-1))/2, float)
            self.cell.compute_distances(work, self.pos)
            ishort = (work < bonds.max_length*1.01).nonzero()[0]
            new_bonds = []
            for i in ishort:
                i0, i1 = _unravel_triangular(i)
                n0 = self.numbers[i0]
                n1 = self.numbers[i1]
                if exceptions is not None:
                    threshold = exceptions.get((n0, n1))
                    if threshold is None and n0!=n1:
                        threshold = exceptions.get((n1, n0))
                    if threshold is not None:
                        if work[i] < threshold:
                            new_bonds.append([i0, i1])
                        continue
                if bonds.bonded(n0, n1, work[i]):
                    new_bonds.append([i0, i1])
            self.bonds = np.array(new_bonds)
            self._init_derived_bonds()

    def detect_ffatypes(self, rules):
        """Initialize the ``ffatypes`` attribute based on ATSELECT rules.

           **Argument:**

           rules
                A list of (ffatype, rule) pairs that will be used to initialize
                the attributes ``self.ffatypes`` and ``self.ffatype_ids``.

           If the system already has FF atom types, they will be overwritten.
        """
        with log.section('SYS'):
            # Give warning if needed
            if self.ffatypes is not None:
                if log.do_warning:
                    log.warn('Overwriting existing FF atom types.')
            # Compile all the rules
            my_rules = []
            for ffatype, rule in rules:
                check_name(ffatype)
                if isinstance(rule, basestring):
                    rule = atsel_compile(rule)
                my_rules.append((ffatype, rule))
            # Use the rules to detect the atom types
            lookup = {}
            self.ffatypes = []
            self.ffatype_ids = np.zeros(self.natom, int)
            for i in xrange(self.natom):
                my_ffatype = None
                for ffatype, rule in my_rules:
                    if rule(self, i):
                        my_ffatype = ffatype
                        break
                if my_ffatype is None:
                    raise ValueError('Could not detect FF atom type of atom %i.' % i)
                ffatype_id = lookup.get(my_ffatype)
                if ffatype_id is None:
                    ffatype_id = len(lookup)
                    self.ffatypes.append(my_ffatype)
                    lookup[my_ffatype] = ffatype_id
                self.ffatype_ids[i] = ffatype_id
            # Make sure all is done well ...
            self._init_derived_ffatypes()

    def set_standard_masses(self):
        """Initialize the ``masses`` attribute based on the atomic numbers."""
        with log.section('SYS'):
            from molmod.periodic import periodic
            if self.masses is not None:
                if log.do_warning:
                    log.warn('Overwriting existing masses with default masses.')
            self.masses = np.array([periodic[n].mass for n in self.numbers])

    def align_cell(self, lcs=None, swap=True):
        """Align the unit cell with respect to the Cartesian Axes frame

           **Optional Arguments:**

           lcs
                The linear combinations of the unit cell that must get aligned.
                This is a 2x3 array, where each row represents a linear
                combination of cell vectors. The first row is for alignment with
                the x-axis, second for the z-axis. The default value is::

                    np.array([
                        [1, 0, 0],
                        [0, 0, 1],
                    ])

           swap
                By default, the first alignment is done with the z-axis, then
                with the x-axis. The order is reversed when swap is set to
                False.

           The alignment of the first linear combination is always perfect. The
           alignment of the second linear combination is restricted to a plane.
           The cell is always made right-handed. The coordinates are also
           rotated with respect to the origin, but never inverted.

           The attributes of the system are modified in-place. Note that this
           method only works on 3D periodic systems.
        """
        from molmod import Rotation, deg
        # define the target
        target = np.array([
            [1, 0, 0],
            [0, 0, 1],
        ])

        # default value for linear combination
        if lcs is None:
            lcs = target.copy()

        # The starting values
        pos = self.pos
        rvecs = self.cell.rvecs.copy()
        if rvecs.shape != (3,3):
            raise TypeError('The align_cell method only supports 3D periodic systems.')

        # Optionally swap a cell vector if the cell is not right-handed.
        if np.linalg.det(rvecs) < 0:
            # Find a reasonable vector to swap...
            index = rvecs.sum(axis=1).argmin()
            rvecs[index] *= -1

        # Define the source
        source = np.dot(lcs, rvecs)

        # Do the swapping
        if swap:
            target = target[::-1]
            source = source[::-1]

        # auxiliary function
        def get_angle_axis(t, s):
            cos = np.dot(s, t)/np.linalg.norm(s)/np.linalg.norm(t)
            angle = np.arccos(np.clip(cos, -1, 1))
            axis = np.cross(s, t)
            return angle, axis

        # first alignment
        angle, axis = get_angle_axis(target[0], source[0])
        if np.linalg.norm(axis) > 0:
            r1 = Rotation.from_properties(angle, axis, False)
            pos = r1*pos
            rvecs = r1*rvecs
            source = r1*source

        # second alignment
        # Make sure the source is orthogonal to target[0]
        s1p = source[1] - target[0]*np.dot(target[0], source[1])
        angle, axis = get_angle_axis(target[1], s1p)
        r2 = Rotation.from_properties(angle, axis, False)
        pos = r2*pos
        rvecs = r2*rvecs

        # assign
        self.pos = pos
        self.cell = Cell(rvecs)

    def supercell(self, *reps):
        """Return a supercell of the system.

           **Arguments:**

           reps
                An array with repetitions, which must have the same number of
                elements as the number of cell vectors.

           If this method is called with a non-periodic system, a TypeError is
           raised.
        """
        if self.cell.nvec == 0:
            raise TypeError('Can not create a supercell of a non-periodic system.')
        if self.cell.nvec != len(reps):
            raise TypeError('The number of repetitions must match the number of cell vectors.')
        if not isinstance(reps, tuple):
            raise TypeError('The reps argument must be a tuple')
        # A dictionary with new arguments for the construction of the supercell
        new_args = {}

        # A) No repetitions
        if self.ffatypes is not None:
            new_args['ffatypes'] = self.ffatypes.copy()
        if self.scopes is not None:
            new_args['scopes'] = self.scopes.copy()

        # B) Simple repetitions
        rep_all = np.product(reps)
        for attrname in 'numbers', 'ffatype_ids', 'scope_ids', 'charges', 'radii', 'radii2', 'masses':
            value = getattr(self, attrname)
            if value is not None:
                new_args[attrname] = np.tile(value, rep_all)
        attrname = 'dipoles'
        value = getattr(self, attrname)
        if value is not None:
            new_args[attrname] = np.tile(value, (rep_all, 1))

        # C) Cell vectors
        new_args['rvecs'] = self.cell.rvecs*np.array(reps)[:,None]

        # D) Atom positions
        new_pos = np.zeros((self.natom*rep_all, 3), float)
        start = 0
        for iimage in np.ndindex(reps):
            stop = start+self.natom
            new_pos[start:stop] = self.pos + np.dot(iimage, self.cell.rvecs)
            start = stop
        new_args['pos'] = new_pos

        if self.bonds is not None:
            # E) Bonds
            # E.1) A function that translates a set of image indexes and an old atom
            # index into a new atom index
            offsets = {}
            start = 0
            for iimage in np.ndindex(reps):
                offsets[iimage] = start
                start += self.natom
            def to_new_atom_index(iimage, i):
                return offsets[iimage] + i

            # E.2) Construct extended bond information: for each bond, also keep
            # track of periodic image it connects to. Note that this information
            # is implicit in yaff, and derived using the minimum image convention.
            rel_iimage = {}
            for ibond in xrange(len(self.bonds)):
                i0, i1 = self.bonds[ibond]
                delta = self.pos[i0] - self.pos[i1]
                frac = np.dot(self.cell.gvecs, delta)
                rel_iimage[ibond] = np.ceil(frac-0.5)

            # E.3) Create the new bonds
            new_bonds = np.zeros((len(self.bonds)*rep_all,2), int)
            counter = 0
            for iimage0 in np.ndindex(reps):
                for ibond in xrange(len(self.bonds)):
                    i0, i1 = self.bonds[ibond]
                    # Translate i0 to the new index.
                    j0 = to_new_atom_index(iimage0, i0)
                    # Also translate i1 to the new index. This is a bit more tricky.
                    # The difficult case occurs when the bond between i0 and i1
                    # connects different periodic images. In that case, the change
                    # in periodic image must be taken into account.
                    iimage1 = tuple((iimage0[c] + rel_iimage[ibond][c]) % reps[c] for c in xrange(len(reps)))
                    j1 = to_new_atom_index(iimage1, i1)
                    new_bonds[counter,0] = j0
                    new_bonds[counter,1] = j1
                    counter += 1
            new_args['bonds'] = new_bonds

        # Done
        return System(**new_args)

    def remove_duplicate(self, threshold=0.1):
        '''Return a system object in which the duplicate atoms and bonds are removed.

           **Optional argument:**

           threshold
                The minimum distance between two atoms that are supposed to be
                different.

           When it makes sense, properties of overlapping atoms are averaged
           out. In other cases, the atom with the lowest index in a cluster of
           overlapping atoms defines the new value of a property.
        '''
        # compute distances
        ndist = (self.natom*(self.natom-1))/2
        if ndist == 0: # single atom systems, go home ...
            return
        dists = np.zeros(ndist)
        self.cell.compute_distances(dists, self.pos)

        # find clusters of overlapping atoms
        from molmod import ClusterFactory
        cf = ClusterFactory()
        counter = 0
        for i0 in xrange(self.natom):
            for i1 in xrange(i0):
                if dists[counter] < threshold:
                    cf.add_related(i0, i1)
                counter += 1
        clusters = [c.items for c in cf.get_clusters()]

        # make a mapping from new to old atoms
        newold = {}
        oldnew = {}
        counter = 0
        for cluster in clusters: # all merged atoms come first
            newold[counter] = sorted(cluster)
            for item in cluster:
                oldnew[item] = counter
            counter += 1
        if len(clusters) > 0:
            old_reduced = set.union(*clusters)
        else:
            old_reduced = []
        for item in xrange(self.natom): # all remaining atoms follow
            if item not in old_reduced:
                newold[counter] = [item]
                oldnew[item] = counter
                counter += 1
        natom = len(newold)

        def reduce_int_array(old):
            if old is None:
                return None
            else:
                new = np.zeros(natom, old.dtype)
                for inew, iolds in newold.iteritems():
                    new[inew] = old[iolds[0]]
                return new

        def reduce_float_array(old):
            if old is None:
                return None
            else:
                new = np.zeros(natom, old.dtype)
                for inew, iolds in newold.iteritems():
                    new[inew] = old[iolds].mean()
                return new

        def reduce_float_matrix(old):
            '''Reduce array with dim=2'''
            if old is None:
                return None
            else:
                new = np.zeros((natom,np.shape(old)[1]), old.dtype)
                for inew, iolds in newold.iteritems():
                    new[inew] = old[iolds].mean(axis=0)
                return new

        # trivial cases
        numbers = reduce_int_array(self.numbers)
        scope_ids = reduce_int_array(self.scope_ids)
        ffatype_ids = reduce_int_array(self.ffatype_ids)
        charges = reduce_float_array(self.charges)
        radii = reduce_float_array(self.radii)
        dipoles = reduce_float_matrix(self.dipoles)
        radii2 = reduce_float_array(self.radii2)
        masses = reduce_float_array(self.masses)

        # create averaged positions
        pos = np.zeros((natom, 3), float)
        for inew, iolds in newold.iteritems():
            # move to the same image
            oldposs = self.pos[iolds].copy()
            assert oldposs.ndim == 2
            ref = oldposs[0]
            for oldpos in oldposs[1:]:
                delta = oldpos-ref
                self.cell.mic(delta)
                oldpos[:] = delta+ref
            # compute mean position
            pos[inew] = oldposs.mean(axis=0)

        # create reduced list of bonds
        if self.bonds is None:
            bonds = None
        else:
            bonds = set((oldnew[ia], oldnew[ib]) for ia, ib in self.bonds)
            bonds = np.array([bond for bond in bonds])

        return self.__class__(numbers, pos, self.scopes, scope_ids, self.ffatypes, ffatype_ids, bonds, self.cell.rvecs, charges, radii, dipoles, radii2, masses)

    def subsystem(self, indexes):
        '''Return a System instance in which only the given atom are retained.'''

        def reduce_array(old):
            if old is None:
                return None
            else:
                new = np.zeros((len(indexes),) + old.shape[1:], old.dtype)
                for inew, iold in enumerate(indexes):
                    new[inew] = old[iold]
                return new

        def reduce_scopes():
            if self.scopes is None:
                return None
            else:
                return [self.get_scope(i) for i in indexes]

        def reduce_ffatypes():
            if self.ffatypes is None:
                return None
            else:
                return [self.get_ffatype(i) for i in indexes]

        def reduce_bonds(old):
            translation = dict((iold, inew) for inew, iold in enumerate(indexes))
            new = []
            for old0, old1 in old:
                new0 = translation.get(old0)
                new1 = translation.get(old1)
                if not (new0 is None or new1 is None):
                    new.append([new0, new1])
            return new

        return System(
            numbers=reduce_array(self.numbers),
            pos=reduce_array(self.pos),
            scopes=reduce_scopes(),
            ffatypes=reduce_ffatypes(),
            bonds=reduce_bonds(self.bonds),
            rvecs=self.cell.rvecs,
            charges=reduce_array(self.charges),
            radii=reduce_array(self.radii),
            dipoles=reduce_array(self.dipoles),
            radii2=reduce_array(self.radii2),
            masses=reduce_array(self.masses),
        )

    def cut_bonds(self, indexes):
        '''Remove all bonds of a fragment with the remainder of the system;

           **Arguments:**

           indexes
                The atom indexes in the fragment
        '''
        new_bonds = []
        indexes = set(indexes)
        for i0, i1 in self.bonds:
            if not ((i0 in indexes) ^ (i1 in indexes)):
                new_bonds.append([i0, i1])
        self.bonds = np.array(new_bonds)

    def to_file(self, fn):
        """Write the system to a file

           **Arguments:**

           fn
                The file to write to.

           Supported formats are:

           chk
                Internal human-readable checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           h5
                Internal binary checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           xyz
                A simple file with atomic positions and elements. Coordinates
                are written in Angstroms.
        """
        if fn.endswith('.chk'):
            from molmod.io import dump_chk
            dump_chk(fn, {
                'numbers': self.numbers,
                'pos': self.pos,
                'ffatypes': self.ffatypes,
                'ffatype_ids': self.ffatype_ids,
                'scopes': self.scopes,
                'scope_ids': self.scope_ids,
                'bonds': self.bonds,
                'rvecs': self.cell.rvecs,
                'charges': self.charges,
                'radii': self.radii,
                'dipoles': self.dipoles,
                'radii2': self.radii2,
                'masses': self.masses,
            })
        elif fn.endswith('.h5'):
            with h5.File(fn, 'w') as f:
                self.to_hdf5(f)
        elif fn.endswith('.xyz'):
            from molmod.io import XYZWriter
            from molmod.periodic import periodic
            xyz_writer = XYZWriter(fn, [periodic[n].symbol for n in self.numbers])
            xyz_writer.dump(str(self), self.pos)
        else:
            raise NotImplementedError('The extension of %s does not correspond to any known format.' % fn)
        if log.do_high:
            with log.section('SYS'):
                log('Wrote system to %s.' % fn)

    def to_hdf5(self, f):
        """Write the system to a HDF5 file.

           **Arguments:**

           f
                A Writable h5.File object.
        """
        if 'system' in f:
            raise ValueError('The HDF5 file already contains a system description.')
        sgrp = f.create_group('system')
        sgrp.create_dataset('numbers', data=self.numbers)
        sgrp.create_dataset('pos', data=self.pos)
        if self.scopes is not None:
            sgrp.create_dataset('scopes', data=self.scopes, dtype='a22')
            sgrp.create_dataset('scope_ids', data=self.scope_ids)
        if self.ffatypes is not None:
            sgrp.create_dataset('ffatypes', data=self.ffatypes, dtype='a22')
            sgrp.create_dataset('ffatype_ids', data=self.ffatype_ids)
        if self.bonds is not None:
            sgrp.create_dataset('bonds', data=self.bonds)
        if self.cell.nvec > 0:
            sgrp.create_dataset('rvecs', data=self.cell.rvecs)
        if self.charges is not None:
            sgrp.create_dataset('charges', data=self.charges)
        if self.radii is not None:
            sgrp.create_dataset('radii', data=self.radii)
        if self.dipoles is not None:
            sgrp.create_dataset('dipoles', data=self.dipoles)
        if self.radii2 is not None:
            sgrp.create_dataset('radii2', data=self.radii2)
        if self.masses is not None:
            sgrp.create_dataset('masses', data=self.masses)
Beispiel #4
0
class System(object):
    def __init__(self,
                 numbers,
                 pos,
                 scopes=None,
                 scope_ids=None,
                 ffatypes=None,
                 ffatype_ids=None,
                 bonds=None,
                 rvecs=None,
                 charges=None,
                 radii=None,
                 valence_charges=None,
                 dipoles=None,
                 radii2=None,
                 masses=None):
        r'''Initialize a System object.

           **Arguments:**

           numbers
                A numpy array with atomic numbers

           pos
                A numpy array (N,3) with atomic coordinates in Bohr.

           **Optional arguments:**

           scopes
                A list with scope names

           scope_ids
                A list of scope indexes that links each atom with an element of
                the scopes list. If this argument is not present, while scopes
                is given, it is assumed that scopes contains a scope name for
                every atom, i.e. that it is a list with length natom. In that
                case, it will be converted automatically to a scopes list
                with only unique name together with a corresponding scope_ids
                array.

           ffatypes
                A list of labels of the force field atom types.

           ffatype_ids
                A list of atom type indexes that links each atom with an element
                of the list ffatypes. If this argument is not present, while
                ffatypes is given, it is assumed that ffatypes contains an
                atom type for every element, i.e. that it is a list with length
                natom. In that case, it will be converted automatically to
                a short ffatypes list with only unique elements (within each
                scope) together with a corresponding ffatype_ids array.

           bonds
                a numpy array (B,2) with atom indexes (counting starts from
                zero) to define the chemical bonds.

           rvecs
                An array whose rows are the unit cell vectors. At most three
                rows are allowed, each containing three Cartesian coordinates.

           charges
                An array of atomic charges

           radii
                An array of atomic radii, :math:`R_{A,c}`, that determine shape of the atomic
                charge distribution:

                .. math::

                    \rho_{A,c}(\mathbf{r}) = \frac{q_A}{\pi^{3/2}R_{A,c}^3} \exp\left(
                    -\frac{|r - \mathbf{R}_A|^2}{R_{A,c}^2}
                    \right)

           valence_charges
                In case a point-core + distribute valence charge is used, this
                vector contains the valence charges. The core charges can be
                computed by subtracting the valence charges from the net
                charges.

           dipoles
                An array of atomic dipoles

           radii2
                An array of atomic radii, :math:`R_{A,d}`, that determine shape of the
                atomic dipole distribution:

                .. math::

                   \rho_{A,d}(\mathbf{r}) = -2\frac{\mathbf{d}_A \cdot (\mathbf{r} - \mathbf{R}_A)}{
                   \sqrt{\pi} R_{A,d}^5
                   }\exp\left(
                    -\frac{|r - \mathbf{R}_A|^2}{R_{A,d}^2}
                    \right)

           masses
                The atomic masses (in atomic units, i.e. m_e)


           Several attributes are derived from the (optional) arguments:

           * ``cell`` contains the rvecs attribute and is an instance of the
             ``Cell`` class.

           * ``neighs1``, ``neighs2`` and ``neighs3`` are dictionaries derived
             from ``bonds`` that contain atoms that are separated 1, 2 and 3
             bonds from a given atom, respectively. This means that i in
             system.neighs3[j] is ``True`` if there are three bonds between
             atoms i and j.
        '''
        if len(numbers.shape) != 1:
            raise ValueError(
                'Argument numbers must be a one-dimensional array.')
        if pos.shape != (len(numbers), 3):
            raise ValueError(
                'The pos array must have Nx3 rows. Mismatch with numbers argument with shape (N,).'
            )
        self.numbers = numbers
        self.pos = pos
        self.ffatypes = ffatypes
        self.ffatype_ids = ffatype_ids
        self.scopes = scopes
        self.scope_ids = scope_ids
        self.bonds = bonds
        self.cell = Cell(rvecs)
        self.charges = charges
        self.radii = radii
        self.valence_charges = valence_charges
        self.dipoles = dipoles
        self.radii2 = radii2
        self.masses = masses
        with log.section('SYS'):
            # report some stuff
            self._init_log()
            # compute some derived attributes
            self._init_derived()

    def _init_log(self):
        if log.do_medium:
            log('Unit cell')
            log.hline()
            log('Number of periodic dimensions: %i' % self.cell.nvec)
            lengths, angles = self.cell.parameters
            names = 'abc'
            for i in range(len(lengths)):
                log('Cell parameter %5s: %10s' %
                    (names[i], log.length(lengths[i])))
            names = 'alpha', 'beta', 'gamma'
            for i in range(len(angles)):
                log('Cell parameter %5s: %10s' %
                    (names[i], log.angle(angles[i])))
            log.hline()
            log.blank()

    def _init_derived(self):
        if self.bonds is not None:
            self._init_derived_bonds()
        if self.scopes is not None:
            self._init_derived_scopes()
        elif self.scope_ids is not None:
            raise ValueError(
                'The scope_ids only make sense when the scopes argument is given.'
            )
        if self.ffatypes is not None:
            self._init_derived_ffatypes()
        elif self.ffatype_ids is not None:
            raise ValueError(
                'The ffatype_ids only make sense when the ffatypes argument is given.'
            )

    def _init_derived_bonds(self):
        # 1-bond neighbors
        self.neighs1 = dict((i, set([])) for i in range(self.natom))
        for i0, i1 in self.bonds:
            self.neighs1[i0].add(i1)
            self.neighs1[i1].add(i0)
        # 2-bond neighbors
        self.neighs2 = dict((i, set([])) for i in range(self.natom))
        for i0, n0 in self.neighs1.items():
            for i1 in n0:
                for i2 in self.neighs1[i1]:
                    # Require that there are no shorter paths than two bonds between
                    # i0 and i2. Also avoid duplicates.
                    if i2 > i0 and i2 not in self.neighs1[i0]:
                        self.neighs2[i0].add(i2)
                        self.neighs2[i2].add(i0)
        # 3-bond neighbors
        self.neighs3 = dict((i, set([])) for i in range(self.natom))
        for i0, n0 in self.neighs1.items():
            for i1 in n0:
                for i3 in self.neighs2[i1]:
                    # Require that there are no shorter paths than three bonds
                    # between i0 and i3. Also avoid duplicates.
                    if i3 != i0 and i3 not in self.neighs1[
                            i0] and i3 not in self.neighs2[i0]:
                        self.neighs3[i0].add(i3)
                        self.neighs3[i3].add(i0)
        # 4-bond neighbors
        self.neighs4 = dict((i, set([])) for i in range(self.natom))
        for i0, n0 in self.neighs1.items():
            for i1 in n0:
                for i4 in self.neighs3[i1]:
                    # Require that there are no shorter paths than three bonds
                    # between i0 and i4. Also avoid duplicates.
                    if i4 != i0 and i4 not in self.neighs1[
                            i0] and i4 not in self.neighs2[
                                i0] and i4 not in self.neighs3[i0]:
                        self.neighs4[i0].add(i4)
                        self.neighs4[i4].add(i0)
        # report some basic stuff on screen
        if log.do_medium:
            log('Analysis of the bonds:')
            bond_types = {}
            for i0, i1 in self.bonds:
                key = tuple(sorted([self.numbers[i0], self.numbers[i1]]))
                bond_types[key] = bond_types.get(key, 0) + 1
            log.hline()
            log(' First   Second   Count')
            for (num0, num1), count in sorted(bond_types.items()):
                log('%6i   %6i   %5i' % (num0, num1, count))
            log.hline()
            log.blank()

            log('Analysis of the neighbors:')
            log.hline()
            log('Number of first neighbors:  %6i' %
                (sum(len(n) for n in self.neighs1.values()) // 2))
            log('Number of second neighbors: %6i' %
                (sum(len(n) for n in self.neighs2.values()) // 2))
            log('Number of third neighbors:  %6i' %
                (sum(len(n) for n in self.neighs3.values()) // 2))
            # Collect all types of 'environments' for each element. This is
            # useful to double check the bonds
            envs = {}
            for i0 in range(self.natom):
                num0 = self.numbers[i0]
                nnums = tuple(
                    sorted(self.numbers[i1] for i1 in self.neighs1[i0]))
                key = (num0, nnums)
                envs[key] = envs.get(key, 0) + 1
            # Print the environments on screen
            log.hline()
            log('Element   Neighboring elements   Count')
            for (num0, nnums), count in sorted(envs.items()):
                log('%7i   %20s   %5i' %
                    (num0, ','.join(str(num1) for num1 in nnums), count))
            log.hline()
            log.blank()

    def _init_derived_scopes(self):
        if self.scope_ids is None:
            if len(self.scopes) != self.natom:
                raise TypeError(
                    'When the scope_ids are derived automatically, the length of the scopes list must match the number of atoms.'
                )
            lookup = {}
            scopes = []
            self.scope_ids = np.zeros(self.natom, int)
            for i in range(self.natom):
                scope = self.scopes[i]
                scope_id = lookup.get(scope)
                if scope_id is None:
                    scope_id = len(scopes)
                    scopes.append(scope)
                    lookup[scope] = scope_id
                self.scope_ids[i] = scope_id
            self.scopes = scopes
        for scope in self.scopes:
            check_name(scope)
        # check the range of the ids
        if self.scope_ids.min() != 0 or self.scope_ids.max() != len(
                self.scopes) - 1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        if log.do_medium:
            log('The following scopes are present in the system:')
            log.hline()
            log('                 Scope   ID   Number of atoms')
            log.hline()
            for scope_id, scope in enumerate(self.scopes):
                log('%22s  %3i       %3i' %
                    (scope, scope_id, (self.scope_ids == scope_id).sum()))
            log.hline()
            log.blank()

    def _init_derived_ffatypes(self):
        if self.ffatype_ids is None:
            if len(self.ffatypes) != self.natom:
                raise TypeError(
                    'When the ffatype_ids are derived automatically, the length of the ffatypes list must match the number of atoms.'
                )
            lookup = {}
            ffatypes = []
            self.ffatype_ids = np.zeros(self.natom, int)
            for i in range(self.natom):
                if self.scope_ids is None:
                    ffatype = self.ffatypes[i]
                    key = ffatype, None
                else:
                    scope_id = self.scope_ids[i]
                    ffatype = self.ffatypes[i]
                    key = ffatype, scope_id
                ffatype_id = lookup.get(key)
                if ffatype_id is None:
                    ffatype_id = len(ffatypes)
                    ffatypes.append(ffatype)
                    lookup[key] = ffatype_id
                self.ffatype_ids[i] = ffatype_id
            self.ffatypes = ffatypes
        for ffatype in self.ffatypes:
            check_name(ffatype)
        # check the range of the ids
        if self.ffatype_ids.min() != 0 or self.ffatype_ids.max() != len(
                self.ffatypes) - 1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        # differentiate ffatype_ids if the same ffatype_id is used in different
        # scopes
        if self.scopes is not None:
            self.ffatype_id_to_scope_id = {}
            fixed_fids = {}
            for i in range(self.natom):
                fid = self.ffatype_ids[i]
                sid = self.ffatype_id_to_scope_id.get(fid)
                if sid is None:
                    self.ffatype_id_to_scope_id[fid] = self.scope_ids[i]
                elif sid != self.scope_ids[i]:
                    # We found the same ffatype_id in a different scope_id. This
                    # must be fixed. First check if we have already a new
                    # scope_id ready
                    sid = self.scope_ids[i]
                    new_fid = fixed_fids.get((sid, fid))
                    if new_fid is None:
                        # No previous new fid create, do it now.
                        new_fid = len(self.ffatypes)
                        # Copy the ffatype label
                        self.ffatypes.append(self.ffatypes[fid])
                        # Keep track of the new fid
                        fixed_fids[(sid, fid)] = new_fid
                        if log.do_warning:
                            log.warn(
                                'Atoms with type ID %i in scope %s were changed to type ID %i.'
                                % (fid, self.scopes[sid], new_fid))
                    # Apply the new fid
                    self.ffatype_ids[i] = new_fid
                    self.ffatype_id_to_scope_id[new_fid] = sid
        # Turn the ffatypes in the scopes into array
        if self.ffatypes is not None:
            self.ffatypes = np.array(self.ffatypes, copy=False)
        if self.scopes is not None:
            self.scopes = np.array(self.scopes, copy=False)
        # check the range of the ids
        if self.ffatype_ids.min() != 0 or self.ffatype_ids.max() != len(
                self.ffatypes) - 1:
            raise ValueError('The ffatype_ids have incorrect bounds.')
        if log.do_medium:
            log('The following atom types are present in the system:')
            log.hline()
            if self.scopes is None:
                log('             Atom type   ID   Number of atoms')
                log.hline()
                for ffatype_id, ffatype in enumerate(self.ffatypes):
                    log('%22s  %3i       %3i' %
                        (ffatype, ffatype_id,
                         (self.ffatype_ids == ffatype_id).sum()))
            else:
                log('                 Scope              Atom type   ID   Number of atoms'
                    )
                log.hline()
                for ffatype_id, ffatype in enumerate(self.ffatypes):
                    scope = self.scopes[
                        self.ffatype_id_to_scope_id[ffatype_id]]
                    log('%22s %22s  %3i       %3i' %
                        (scope, ffatype, ffatype_id,
                         (self.ffatype_ids == ffatype_id).sum()))
            log.hline()
            log.blank()

    def _get_natom(self):
        """The number of atoms"""
        return len(self.pos)

    natom = property(_get_natom)

    def _get_nffatype(self):
        """The number of atom types"""
        return len(self.ffatypes)

    nffatype = property(_get_nffatype)

    def _get_nbond(self):
        '''The number of bonds'''
        if self.bonds is None:
            return 0
        else:
            return len(self.bonds)

    nbond = property(_get_nbond)

    @classmethod
    def from_file(cls, *fns, **user_kwargs):
        """Construct a new System instance from one or more files

           **Arguments:**

           fn1, fn2, ...
                A list of filenames that are read in order. Information in later
                files overrides information in earlier files.

           **Optional arguments:**

           Any argument from the default constructor ``__init__``. These must be
           given with keywords.

           **Supported file formats**

           .xyz
                Standard Cartesian coordinates file (in angstroms). Atomic
                positions and atomic numbers are read from this file. If the
                title consists of 3, 6 or 9 numbers, each group of three numbers
                is interpreted as a cell vector (in angstroms). A guess of the
                bonds will be made based on inter-atomic distances.

           .psf
                Atom types and bonds are read from this file

           .chk
                Internal text-based checkpoint format. It just contains a
                dictionary with the constructor arguments.
        """
        with log.section('SYS'):
            kwargs = {}
            for fn in fns:
                if fn.endswith('.xyz'):
                    from molmod import Molecule
                    mol = Molecule.from_file(fn)
                    kwargs['numbers'] = mol.numbers.copy()
                    kwargs['pos'] = mol.coordinates.copy()
                elif fn.endswith('.psf'):
                    from molmod.io import PSFFile
                    psf = PSFFile(fn)
                    kwargs['ffatypes'] = psf.atom_types
                    kwargs['bonds'] = np.array(psf.bonds, copy=False)
                    kwargs['charges'] = np.array(psf.charges, copy=False)
                elif fn.endswith('.chk'):
                    from molmod.io import load_chk
                    allowed_keys = [
                        'numbers',
                        'pos',
                        'scopes',
                        'scope_ids',
                        'ffatypes',
                        'ffatype_ids',
                        'bonds',
                        'rvecs',
                        'charges',
                        'radii',
                        'valence_charges',
                        'dipoles',
                        'radii2',
                        'masses',
                    ]
                    for key, value in load_chk(fn).items():
                        if key in allowed_keys:
                            kwargs.update({key: value})
                elif fn.endswith('.h5'):
                    with h5.File(fn, 'r') as f:
                        return cls.from_hdf5(f)
                else:
                    raise IOError('Can not read from file \'%s\'.' % fn)
                if log.do_high:
                    log('Read system parameters from %s.' % fn)
            kwargs.update(user_kwargs)
        return cls(**kwargs)

    @classmethod
    def from_hdf5(cls, f):
        '''Create a system from an HDF5 file/group containing a system group

           **Arguments:**

           f
                An open h5.File object with a system group. The system group
                must at least contain a numbers and pos dataset.
        '''
        sgrp = f['system']
        kwargs = {
            'numbers': sgrp['numbers'][:],
            'pos': sgrp['pos'][:],
        }
        for key in 'scope_ids', 'ffatype_ids', 'bonds', 'rvecs', 'charges', 'masses':
            if key in sgrp:
                kwargs[key] = sgrp[key][:]
        # String arrays have to be converted back to unicode...
        for key in 'scopes', 'ffatypes':
            if key in sgrp:
                kwargs[key] = np.asarray(sgrp[key][:], 'U22')
        if log.do_high:
            log('Read system parameters from %s.' % f.filename)
        return cls(**kwargs)

    def to_file(self, fn):
        """Write the system to a file

           **Arguments:**

           fn
                The file to write to.

           Supported formats are:

           chk
                Internal human-readable checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           h5
                Internal binary checkpoint format. This format includes
                all the information of a system object. All data are stored in
                atomic units.

           xyz
                A simple file with atomic positions and elements. Coordinates
                are written in Angstroms.
        """
        if fn.endswith('.chk'):
            from molmod.io import dump_chk
            dump_chk(
                fn, {
                    'numbers': self.numbers,
                    'pos': self.pos,
                    'ffatypes': self.ffatypes,
                    'ffatype_ids': self.ffatype_ids,
                    'scopes': self.scopes,
                    'scope_ids': self.scope_ids,
                    'bonds': self.bonds,
                    'rvecs': self.cell.rvecs,
                    'charges': self.charges,
                    'radii': self.radii,
                    'valence_charges': self.valence_charges,
                    'dipoles': self.dipoles,
                    'radii2': self.radii2,
                    'masses': self.masses,
                })
        elif fn.endswith('.h5'):
            with h5.File(fn, 'w') as f:
                self.to_hdf5(f)
        elif fn.endswith('.xyz'):
            from molmod.io import XYZWriter
            from molmod.periodic import periodic
            xyz_writer = XYZWriter(fn,
                                   [periodic[n].symbol for n in self.numbers])
            xyz_writer.dump(str(self), self.pos)
        else:
            raise NotImplementedError(
                'The extension of %s does not correspond to any known format.'
                % fn)
        if log.do_high:
            with log.section('SYS'):
                log('Wrote system to %s.' % fn)

    def to_hdf5(self, f):
        """Write the system to a HDF5 file.

           **Arguments:**

           f
                A Writable h5.File object.
        """
        if 'system' in f:
            raise ValueError(
                'The HDF5 file already contains a system description.')
        sgrp = f.create_group('system')
        sgrp.create_dataset('numbers', data=self.numbers)
        sgrp.create_dataset('pos', data=self.pos)
        if self.scopes is not None:
            sgrp.create_dataset('scopes', data=np.asarray(self.scopes, 'S22'))
            sgrp.create_dataset('scope_ids', data=self.scope_ids)
        if self.ffatypes is not None:
            sgrp.create_dataset('ffatypes',
                                data=np.asarray(self.ffatypes, 'S22'))
            sgrp.create_dataset('ffatype_ids', data=self.ffatype_ids)
        if self.bonds is not None:
            sgrp.create_dataset('bonds', data=self.bonds)
        if self.cell.nvec > 0:
            sgrp.create_dataset('rvecs', data=self.cell.rvecs)
        if self.charges is not None:
            sgrp.create_dataset('charges', data=self.charges)
        if self.radii is not None:
            sgrp.create_dataset('radii', data=self.radii)
        if self.valence_charges is not None:
            sgrp.create_dataset('valence_charges', data=self.charges)
        if self.dipoles is not None:
            sgrp.create_dataset('dipoles', data=self.dipoles)
        if self.radii2 is not None:
            sgrp.create_dataset('radii2', data=self.radii2)
        if self.masses is not None:
            sgrp.create_dataset('masses', data=self.masses)

    def get_scope(self, index):
        """Return the of the scope (string) of atom with given index"""
        return self.scopes[self.scope_ids[index]]

    def get_ffatype(self, index):
        """Return the of the ffatype (string) of atom with given index"""
        return self.ffatypes[self.ffatype_ids[index]]

    def get_indexes(self, rule):
        """Return the atom indexes that match the filter ``rule``

           ``rule`` can be a function that accepts two arguments: system and an
           atom index and that returns True of the atom with index i is of a
           given type. On the other hand ``rule`` can be an ATSELECT string that
           defines the atoms of interest.

           A list of atom indexes is returned.
        """
        if isinstance(rule, str):
            rule = atsel_compile(rule)
        return np.array([i for i in range(self.natom) if rule(self, i)])

    def iter_bonds(self):
        """Iterate over all bonds."""
        if self.bonds is not None:
            for i1, i2 in self.bonds:
                yield i1, i2

    def iter_angles(self):
        """Iterative over all possible valence angles.

           This routine is based on the attribute ``bonds``.
        """
        if self.bonds is not None:
            for i1 in range(self.natom):
                for i0 in self.neighs1[i1]:
                    for i2 in self.neighs1[i1]:
                        if i0 > i2:
                            yield i0, i1, i2

    def iter_dihedrals(self):
        """Iterative over all possible dihedral angles.

           This routine is based on the attribute ``bonds``.
        """
        if self.bonds is not None:
            for i1, i2 in self.bonds:
                for i0 in self.neighs1[i1]:
                    if i0 == i2: continue
                    for i3 in self.neighs1[i2]:
                        if i1 == i3: continue
                        if i0 == i3: continue
                        yield i0, i1, i2, i3

    def iter_oops(self):
        """Iterative over all possible oop patterns."

           This routine is based on the attribute ``bonds``.
        """
        if self.bonds is not None:
            for i3 in range(self.natom):
                if len(self.neighs1[i3]) == 3:
                    i0, i1, i2 = self.neighs1[i3]
                    yield i0, i1, i2, i3

    def detect_bonds(self, exceptions=None):
        """Initialize the ``bonds`` attribute based on inter-atomic distances

           **Optional argument:**

           exceptions:
                Specify custom threshold for certain pairs of elements. This
                must be a dictionary with ((num0, num1), threshold) as items.

           For each pair of elements, a distance threshold is used to detect
           bonded atoms. The distance threshold is based on a database of known
           bond lengths. If the database does not contain a record for the given
           element pair, the threshold is based on the sum of covalent radii.
        """
        with log.section('SYS'):
            from molmod.bonds import bonds
            if self.bonds is not None:
                if log.do_warning:
                    log.warn('Overwriting existing bonds.')
            work = np.zeros((self.natom * (self.natom - 1)) // 2, float)
            self.cell.compute_distances(work, self.pos)
            ishort = (work < bonds.max_length * 1.01).nonzero()[0]
            new_bonds = []
            for i in ishort:
                i0, i1 = _unravel_triangular(i)
                n0 = self.numbers[i0]
                n1 = self.numbers[i1]
                if exceptions is not None:
                    threshold = exceptions.get((n0, n1))
                    if threshold is None and n0 != n1:
                        threshold = exceptions.get((n1, n0))
                    if threshold is not None:
                        if work[i] < threshold:
                            new_bonds.append([i0, i1])
                        continue
                if bonds.bonded(n0, n1, work[i]):
                    new_bonds.append([i0, i1])
            self.bonds = np.array(new_bonds)
            self._init_derived_bonds()

    def detect_ffatypes(self, rules):
        """Initialize the ``ffatypes`` attribute based on ATSELECT rules.

           **Argument:**

           rules
                A list of (ffatype, rule) pairs that will be used to initialize
                the attributes ``self.ffatypes`` and ``self.ffatype_ids``.

           If the system already has FF atom types, they will be overwritten.
        """
        with log.section('SYS'):
            # Give warning if needed
            if self.ffatypes is not None:
                if log.do_warning:
                    log.warn('Overwriting existing FF atom types.')
            # Compile all the rules
            my_rules = []
            for ffatype, rule in rules:
                check_name(ffatype)
                if isinstance(rule, str):
                    rule = atsel_compile(rule)
                my_rules.append((ffatype, rule))
            # Use the rules to detect the atom types
            lookup = {}
            self.ffatypes = []
            self.ffatype_ids = np.zeros(self.natom, int)
            for i in range(self.natom):
                my_ffatype = None
                for ffatype, rule in my_rules:
                    if rule(self, i):
                        my_ffatype = ffatype
                        break
                if my_ffatype is None:
                    raise ValueError(
                        'Could not detect FF atom type of atom %i.' % i)
                ffatype_id = lookup.get(my_ffatype)
                if ffatype_id is None:
                    ffatype_id = len(lookup)
                    self.ffatypes.append(my_ffatype)
                    lookup[my_ffatype] = ffatype_id
                self.ffatype_ids[i] = ffatype_id
            # Make sure all is done well ...
            self._init_derived_ffatypes()

    def set_standard_masses(self):
        """Initialize the ``masses`` attribute based on the atomic numbers."""
        with log.section('SYS'):
            from molmod.periodic import periodic
            if self.masses is not None:
                if log.do_warning:
                    log.warn(
                        'Overwriting existing masses with default masses.')
            self.masses = np.array([periodic[n].mass for n in self.numbers])

    def align_cell(self, lcs=None, swap=True):
        """Align the unit cell with respect to the Cartesian Axes frame

           **Optional Arguments:**

           lcs
                The linear combinations of the unit cell that must get aligned.
                This is a 2x3 array, where each row represents a linear
                combination of cell vectors. The first row is for alignment with
                the x-axis, second for the z-axis. The default value is::

                    np.array([
                        [1, 0, 0],
                        [0, 0, 1],
                    ])

           swap
                By default, the first alignment is done with the z-axis, then
                with the x-axis. The order is reversed when swap is set to
                False.

           The alignment of the first linear combination is always perfect. The
           alignment of the second linear combination is restricted to a plane.
           The cell is always made right-handed. The coordinates are also
           rotated with respect to the origin, but never inverted.

           The attributes of the system are modified in-place. Note that this
           method only works on 3D periodic systems.
        """
        from molmod import Rotation, deg
        # define the target
        target = np.array([
            [1, 0, 0],
            [0, 0, 1],
        ])

        # default value for linear combination
        if lcs is None:
            lcs = target.copy()

        # The starting values
        pos = self.pos
        rvecs = self.cell.rvecs.copy()
        if rvecs.shape != (3, 3):
            raise TypeError(
                'The align_cell method only supports 3D periodic systems.')

        # Optionally swap a cell vector if the cell is not right-handed.
        if np.linalg.det(rvecs) < 0:
            # Find a reasonable vector to swap...
            index = rvecs.sum(axis=1).argmin()
            rvecs[index] *= -1

        # Define the source
        source = np.dot(lcs, rvecs)

        # Do the swapping
        if swap:
            target = target[::-1]
            source = source[::-1]

        # auxiliary function
        def get_angle_axis(t, s):
            cos = np.dot(s, t) / np.linalg.norm(s) / np.linalg.norm(t)
            angle = np.arccos(np.clip(cos, -1, 1))
            axis = np.cross(s, t)
            return angle, axis

        # first alignment
        angle, axis = get_angle_axis(target[0], source[0])
        if np.linalg.norm(axis) > 0:
            r1 = Rotation.from_properties(angle, axis, False)
            pos = r1 * pos
            rvecs = r1 * rvecs
            source = r1 * source

        # second alignment
        # Make sure the source is orthogonal to target[0]
        s1p = source[1] - target[0] * np.dot(target[0], source[1])
        angle, axis = get_angle_axis(target[1], s1p)
        r2 = Rotation.from_properties(angle, axis, False)
        pos = r2 * pos
        rvecs = r2 * rvecs

        # assign
        self.pos = pos
        self.cell = Cell(rvecs)

    def supercell(self, *reps):
        """Return a supercell of the system.

           **Arguments:**

           reps
                An array with repetitions, which must have the same number of
                elements as the number of cell vectors.

           If this method is called with a non-periodic system, a TypeError is
           raised.
        """
        if self.cell.nvec == 0:
            raise TypeError(
                'Can not create a supercell of a non-periodic system.')
        if self.cell.nvec != len(reps):
            raise TypeError(
                'The number of repetitions must match the number of cell vectors.'
            )
        if not isinstance(reps, tuple):
            raise TypeError('The reps argument must be a tuple')
        # A dictionary with new arguments for the construction of the supercell
        new_args = {}

        # A) No repetitions
        if self.ffatypes is not None:
            new_args['ffatypes'] = self.ffatypes.copy()
        if self.scopes is not None:
            new_args['scopes'] = self.scopes.copy()

        # B) Simple repetitions
        rep_all = np.product(reps)
        for attrname in 'numbers', 'ffatype_ids', 'scope_ids', 'charges', \
                        'radii', 'valence_charges', 'radii2', 'masses':
            value = getattr(self, attrname)
            if value is not None:
                new_args[attrname] = np.tile(value, rep_all)
        attrname = 'dipoles'
        value = getattr(self, attrname)
        if value is not None:
            new_args[attrname] = np.tile(value, (rep_all, 1))

        # C) Cell vectors
        new_args['rvecs'] = self.cell.rvecs * np.array(reps)[:, None]

        # D) Atom positions
        new_pos = np.zeros((self.natom * rep_all, 3), float)
        start = 0
        for iimage in np.ndindex(reps):
            stop = start + self.natom
            new_pos[start:stop] = self.pos + np.dot(iimage, self.cell.rvecs)
            start = stop
        new_args['pos'] = new_pos

        if self.bonds is not None:
            # E) Bonds
            # E.1) A function that translates a set of image indexes and an old atom
            # index into a new atom index
            offsets = {}
            start = 0
            for iimage in np.ndindex(reps):
                offsets[iimage] = start
                start += self.natom

            def to_new_atom_index(iimage, i):
                return offsets[iimage] + i

            # E.2) Construct extended bond information: for each bond, also keep
            # track of periodic image it connects to. Note that this information
            # is implicit in yaff, and derived using the minimum image convention.
            rel_iimage = {}
            for ibond in range(len(self.bonds)):
                i0, i1 = self.bonds[ibond]
                delta = self.pos[i0] - self.pos[i1]
                frac = np.dot(self.cell.gvecs, delta)
                rel_iimage[ibond] = np.ceil(frac - 0.5)

            # E.3) Create the new bonds
            new_bonds = np.zeros((len(self.bonds) * rep_all, 2), int)
            counter = 0
            for iimage0 in np.ndindex(reps):
                for ibond in range(len(self.bonds)):
                    i0, i1 = self.bonds[ibond]
                    # Translate i0 to the new index.
                    j0 = to_new_atom_index(iimage0, i0)
                    # Also translate i1 to the new index. This is a bit more tricky.
                    # The difficult case occurs when the bond between i0 and i1
                    # connects different periodic images. In that case, the change
                    # in periodic image must be taken into account.
                    iimage1 = tuple(
                        (iimage0[c] + rel_iimage[ibond][c]) % reps[c]
                        for c in range(len(reps)))
                    j1 = to_new_atom_index(iimage1, i1)
                    new_bonds[counter, 0] = j0
                    new_bonds[counter, 1] = j1
                    counter += 1
            new_args['bonds'] = new_bonds

        # Done
        return System(**new_args)

    def remove_duplicate(self, threshold=0.1):
        '''Return a system object in which the duplicate atoms and bonds are removed.

           **Optional argument:**

           threshold
                The minimum distance between two atoms that are supposed to be
                different.

           When it makes sense, properties of overlapping atoms are averaged
           out. In other cases, the atom with the lowest index in a cluster of
           overlapping atoms defines the new value of a property.
        '''
        # compute distances
        ndist = (self.natom * (self.natom - 1)) // 2
        if ndist == 0:  # single atom systems, go home ...
            return
        dists = np.zeros(ndist)
        self.cell.compute_distances(dists, self.pos)

        # find clusters of overlapping atoms
        from molmod import ClusterFactory
        cf = ClusterFactory()
        counter = 0
        for i0 in range(self.natom):
            for i1 in range(i0):
                if dists[counter] < threshold:
                    cf.add_related(i0, i1)
                counter += 1
        clusters = [c.items for c in cf.get_clusters()]

        # make a mapping from new to old atoms
        newold = {}
        oldnew = {}
        counter = 0
        for cluster in clusters:  # all merged atoms come first
            newold[counter] = sorted(cluster)
            for item in cluster:
                oldnew[item] = counter
            counter += 1
        if len(clusters) > 0:
            old_reduced = set.union(*clusters)
        else:
            old_reduced = []
        for item in range(self.natom):  # all remaining atoms follow
            if item not in old_reduced:
                newold[counter] = [item]
                oldnew[item] = counter
                counter += 1
        natom = len(newold)

        def reduce_int_array(old):
            if old is None:
                return None
            else:
                new = np.zeros(natom, old.dtype)
                for inew, iolds in newold.items():
                    new[inew] = old[iolds[0]]
                return new

        def reduce_float_array(old):
            if old is None:
                return None
            else:
                new = np.zeros(natom, old.dtype)
                for inew, iolds in newold.items():
                    new[inew] = old[iolds].mean()
                return new

        def reduce_float_matrix(old):
            '''Reduce array with dim=2'''
            if old is None:
                return None
            else:
                new = np.zeros((natom, np.shape(old)[1]), old.dtype)
                for inew, iolds in newold.items():
                    new[inew] = old[iolds].mean(axis=0)
                return new

        # trivial cases
        numbers = reduce_int_array(self.numbers)
        scope_ids = reduce_int_array(self.scope_ids)
        ffatype_ids = reduce_int_array(self.ffatype_ids)
        charges = reduce_float_array(self.charges)
        radii = reduce_float_array(self.radii)
        valence_charges = reduce_float_array(self.valence_charges)
        dipoles = reduce_float_matrix(self.dipoles)
        radii2 = reduce_float_array(self.radii2)
        masses = reduce_float_array(self.masses)

        # create averaged positions
        pos = np.zeros((natom, 3), float)
        for inew, iolds in newold.items():
            # move to the same image
            oldposs = self.pos[iolds].copy()
            assert oldposs.ndim == 2
            ref = oldposs[0]
            for oldpos in oldposs[1:]:
                delta = oldpos - ref
                self.cell.mic(delta)
                oldpos[:] = delta + ref
            # compute mean position
            pos[inew] = oldposs.mean(axis=0)

        # create reduced list of bonds
        if self.bonds is None:
            bonds = None
        else:
            bonds = set((oldnew[ia], oldnew[ib]) for ia, ib in self.bonds)
            bonds = np.array([bond for bond in bonds])

        return self.__class__(numbers, pos, self.scopes, scope_ids,
                              self.ffatypes, ffatype_ids, bonds,
                              self.cell.rvecs, charges, radii, valence_charges,
                              dipoles, radii2, masses)

    def subsystem(self, indexes):
        '''Return a System instance in which only the given atom are retained.'''
        def reduce_array(old):
            if old is None:
                return None
            else:
                new = np.zeros((len(indexes), ) + old.shape[1:], old.dtype)
                for inew, iold in enumerate(indexes):
                    new[inew] = old[iold]
                return new

        def reduce_scopes():
            if self.scopes is None:
                return None
            else:
                return [self.get_scope(i) for i in indexes]

        def reduce_ffatypes():
            if self.ffatypes is None:
                return None
            else:
                return [self.get_ffatype(i) for i in indexes]

        def reduce_bonds(old):
            translation = dict(
                (iold, inew) for inew, iold in enumerate(indexes))
            new = []
            for old0, old1 in old:
                new0 = translation.get(old0)
                new1 = translation.get(old1)
                if not (new0 is None or new1 is None):
                    new.append([new0, new1])
            return new

        return System(
            numbers=reduce_array(self.numbers),
            pos=reduce_array(self.pos),
            scopes=reduce_scopes(),
            ffatypes=reduce_ffatypes(),
            bonds=reduce_bonds(self.bonds),
            rvecs=self.cell.rvecs,
            charges=reduce_array(self.charges),
            radii=reduce_array(self.radii),
            valence_charges=reduce_array(self.valence_charges),
            dipoles=reduce_array(self.dipoles),
            radii2=reduce_array(self.radii2),
            masses=reduce_array(self.masses),
        )

    def cut_bonds(self, indexes):
        '''Remove all bonds of a fragment with the remainder of the system;

           **Arguments:**

           indexes
                The atom indexes in the fragment
        '''
        new_bonds = []
        indexes = set(indexes)
        for i0, i1 in self.bonds:
            if not ((i0 in indexes) ^ (i1 in indexes)):
                new_bonds.append([i0, i1])
        self.bonds = np.array(new_bonds)

    def iter_matches(self, other, overlapping=True):
        """Yield all renumberings of atoms that map the given system on the current.

        Parameters
        ----------
        other : yaff.System
            Another system with the same number of atoms (and chemical formula), or less
            atoms.
        overlapping : bool
            When set to False, the returned matches are guaranteed to be mutually
            exclusive. The result may not be unique when partially overlapping matches
            would exist. Use with care.

        The graph distance is used to perform the mapping, so bonds must be defined in
        the current and the given system.
        """
        def make_graph_distance_matrix(system):
            """Return a bond graph distance matrix.

            Parameters
            ----------
            system : System
                Molecule (with bonds) for which the graph distances must be computed.

            The graph distance is used for comparison because it allows the pattern
            matching to make optimal choices of which pairs of atoms to compare next, i.e.
            both bonded or nearby the last matched pair.
            """
            from molmod.graphs import Graph
            return Graph(system.bonds, system.natom).distances

        def error_sq_fn(x, y):
            """Compare bonded versus not bonded, rather than the full graph distance.

            Parameters
            ----------
            x, y: int
                Graph distances from self and other, respectively.

            Graph distances are not completely transferable between self and other, i.e. a
            shorter path may exist between two atoms in the big system (self) that is not
            present in a fragment (other). Hence, only the absence or presence of a direct
            bond must be compared.
            """
            return (min(x - 1, 1) - min(y - 1, 1))**2

        with log.section('SYS'):
            log('Generating allowed indexes for renumbering.')
            # The allowed permutations is just based on the chemical elements, not the atom
            # types, which could also be useful.
            allowed = []
            if self.ffatypes is None or other.ffatypes is None:
                for number1 in other.numbers:
                    allowed.append((self.numbers == number1).nonzero()[0])
            else:
                # Only continue if other.ffatypes is a subset of self.ffatypes
                if not (set(self.ffatypes) >= set(other.ffatypes)):
                    return
                ffatype_ids0 = self.ffatype_ids
                ffatypes0 = list(self.ffatypes)
                order = np.array(
                    [ffatypes0.index(ffatype) for ffatype in other.ffatypes])
                ffatype_ids1 = order[other.ffatype_ids]
                for ffatype_id1 in ffatype_ids1:
                    allowed.append((ffatype_ids0 == ffatype_id1).nonzero()[0])
            log('Building distance matrix for self.')
            dm0 = make_graph_distance_matrix(self)
            log('Building distance matrix for other.')
            dm1 = make_graph_distance_matrix(other)
            # Yield the solutions
            log('Generating renumberings.')
            for match in iter_matches(dm0, dm1, allowed, 1e-3, error_sq_fn,
                                      overlapping):
                yield match