def __init__(self, samples, engine):
    """
    Sets up a mixture of independently collected samples.

    Parameters
    ----------
    samples : list(sample)
        A list of samples, all sharing the same set of properties
        (i.e. the same dataset columns).
    engine :
        The analysis engine; its ``__initialize__`` method is invoked
        at the end of the setup.

    Raises
    ------
    InputError
        If `samples` is an empty list.
    """
    self.samples = samples
    self.engine = engine
    m = self.m = len(samples)
    # Validate before any further processing (samples[0] is accessed below).
    if m == 0:
        raise InputError("list of samples is empty")
    if mics.verbose:
        # np.set_printoptions(precision=4, threshold=15, edgeitems=4, suppress=True)
        info("\n=== Setting up mixture ===")
        info("Analysis method: ", self.engine.__class__.__name__)
        info("Number of samples:", m)
    self.n = np.array([len(sample.dataset) for sample in samples])
    self.neff = np.array([sample.neff for sample in samples])
    names = self.names = list(samples[0].dataset.columns)
    if mics.verbose:
        info("Sample sizes:", self.n)
        info("Effective sample sizes:", self.neff)
        info("Properties:", ", ".join(names))
    # Evaluate every sample's reduced potential at every sampled state.
    potentials = [sample.potential.lambdify() for sample in samples]
    self.u = [multimap(potentials, sample.dataset) for sample in samples]
    # BAR-based initial guess for the relative free energies.
    self.f = bennett(self.u)
    if mics.verbose:
        info("Initial free-energy guess:", self.f)
    self.engine.__initialize__(self)
def histograms(self, property='potential', bins=100, **constants):
    """
    Builds histograms of a property evaluated at all sampled states.

    Parameters
    ----------
    property : str, optional, default='potential'
        Either the literal string ``'potential'`` (histogram of each
        sample's own reduced potential) or a mathematical expression
        involving the sampled collective variables.
    bins : int, optional, default=100
        The number of histogram bins, spanning the overall range of
        the evaluated property across all samples.
    **constants : optional keyword arguments
        Parameter values for evaluating the expression in `property`.

    Returns
    -------
    pandas.DataFrame
        A data frame whose first column holds the bin centers and whose
        remaining columns (``state 1``, ``state 2``, ...) hold the bin
        counts for each sampled state.
    """
    if property == 'potential':
        # Each sample is histogrammed under its own reduced potential.
        y = [multimap([sample.potential.lambdify()], sample.dataset)
             for sample in self]
    else:
        names = list(self[0].dataset.columns)
        function = [func(property, names, constants).lambdify()]
        y = [multimap(function, sample.dataset) for sample in self]
    # Common range across all samples, so every state shares the same bins.
    ymin = min(np.amin(x) for x in y)
    ymax = max(np.amax(x) for x in y)
    delta = (ymax - ymin)/bins
    center = [ymin + delta*(i + 0.5) for i in range(bins)]
    frame = pd.DataFrame({property: center})
    for i, yi in enumerate(y):
        frame["state %s" % (i+1)] = np.histogram(yi, bins, (ymin, ymax))[0]
    return frame
def __compute__(self, functions, constants):
    """
    Evaluates one or more property expressions over every sample's dataset.

    Parameters
    ----------
    functions : str or iterable(str)
        A single expression or an iterable of expressions involving the
        mixture's property names.
    constants : dict
        Parameter values used when parsing the expressions.

    Returns
    -------
    list or None
        One array of evaluated values per sample, or ``None`` if any
        expression fails to parse (InputError) or references an unknown
        name (KeyError).
    """
    try:
        if isinstance(functions, str):
            expressions = [functions]
        else:
            expressions = functions
        compiled = [func(expression, self.names, constants).lambdify()
                    for expression in expressions]
        return [multimap(compiled, sample.dataset)
                for sample in self.samples]
    except (InputError, KeyError):
        # Signal an unparseable/unknown expression to the caller.
        return None
def averaging(self, properties, combinations=None, **constants):
    """
    Computes averages and uncertainties of configurational properties.
    In addition, computes combinations among these averages while
    automatically handling uncertainty propagation.

    Parameters
    ----------
    properties : dict(str: str)
        A dictionary associating names to mathematical expressions. This
        is used to define functions of the collective variables included
        in the samples. Then, averages of these functions will be
        evaluated at all sampled states, along with their uncertainties.
        The expressions might also depend on parameters passed as keyword
        arguments (see below).
    combinations : dict(str: str), optional, default=None
        A dictionary associating names to mathematical expressions. This
        is used to define functions of the names passed as keys in the
        `properties` dictionary. The expressions might also depend on
        parameters passed as keyword arguments (see below).
    **constants : optional keyword arguments
        A set of arguments passed as ``name=value``, used to define
        parameter values for evaluating the mathematical expressions in
        both `properties` and `combinations`.

    Returns
    -------
    pandas.DataFrame
        A data frame containing the computed averages and combinations,
        as well as their estimated standard errors.
    """
    # NOTE: the default was previously a mutable `{}`; None avoids the
    # shared-mutable-default pitfall while keeping the same behavior.
    if combinations is None:
        combinations = {}
    variables = self.dataset.columns.tolist()
    functions = [func(f, variables, constants).lambdify()
                 for f in properties.values()]
    y = multimap(functions, self.dataset)
    ym = np.mean(y, axis=1)
    # Covariance via batch means (batch size self.b) accounts for
    # autocorrelation in the time series.
    Theta = covariance(y, ym, self.b)
    result = propertyDict(properties.keys(), ym, stdError(Theta))
    if combinations:
        # Delta method propagates uncertainties through the combinations.
        delta = deltaMethod(combinations.values(), properties.keys(),
                            constants)
        (h, dh) = delta.evaluate(ym, Theta)
        result.update(propertyDict(combinations.keys(), h, dh))
    return result.to_frame(0)
def subsampling(self, integratedACF=True):
    """
    Performs inline subsampling based on the statistical inefficiency
    ``g`` of the specified attribute `acfun` of :class:`sample`, aiming
    at obtaining a sample of :term:`IID` configurations. Subsampling is
    done via jumps of varying sizes around ``g``, so that the sample
    size decays by a factor of approximately ``1/g``.

    Parameters
    ----------
    integratedACF : bool, optional, default=True
        If true, the integrated :term:`ACF` method :cite:`Chodera_2007`
        will be used for computing the statistical inefficiency.
        Otherwise, the :term:`OBM` method will be used instead.

    Returns
    -------
    :class:`sample`
        Although the subsampling is done inline, the new sample is
        returned for chaining purposes.
    """
    size = len(self.dataset)
    if mics.verbose:
        method = "integrated ACF" if integratedACF else "OBM"
        info("\n=== Subsampling via %s ===" % method)
        info("Original sample size:", size)
    if integratedACF:
        # Statistical inefficiency from the integrated autocorrelation
        # function of the analysis property.
        series = multimap([self.acfun.lambdify()], self.dataset)
        g = timeseries.statisticalInefficiency(series[0])
    else:
        # OBM estimate: ratio of actual to effective sample size.
        g = size / self.neff
    kept = timeseries.subsampleCorrelatedData(self.dataset.index, g)
    self.dataset = self.dataset.reindex(kept)
    self.neff = len(kept)
    if mics.verbose:
        info("Statistical inefficiency:", g)
        info("New sample size:", self.neff)
    return self
def __init__(self, dataset, potential, acfun=None, batchsize=None,
             **constants):
    """
    Sets up a new sample and estimates its effective size via the
    Overlapping Batch Means (OBM) method.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The sampled collective variables, one row per configuration.
    potential : str
        A mathematical expression for the reduced potential function.
    acfun : str, optional, default=None
        The expression whose autocorrelation determines the effective
        sample size; defaults to the reduced potential itself.
    batchsize : int, optional, default=None
        The OBM batch size; defaults to the square root of the sample
        size.
    **constants : optional keyword arguments
        Parameter values for evaluating `potential` and `acfun`.

    Raises
    ------
    FloatingPointError
        If the variance estimates are not finite, so that the effective
        sample size cannot be determined.
    """
    names = dataset.columns.tolist()
    n = len(dataset)
    if batchsize:
        b = self.b = batchsize
    else:
        b = self.b = int(np.sqrt(n))
    if mics.verbose:
        info("\n=== Setting up new sample ===")
        info("Properties:", ", ".join(names))
        info("Constants:", constants)
        info("Reduced potential function:", potential)
        info("Autocorrelation analysis function:",
             acfun if acfun else potential)
        info("Sample size:", n)
        info("Batch size:", b)
    self.dataset = dataset
    self.potential = func(potential, names, constants)
    if acfun is None:
        self.acfun = self.potential
    else:
        self.acfun = func(acfun, names, constants)
    series = multimap([self.acfun.lambdify()], dataset)
    mean = np.mean(series, axis=1)
    # Variances with batch size 1 (no autocorrelation) and b (OBM).
    S1 = covariance(series, mean, 1).item(0)
    Sb = covariance(series, mean, b).item(0)
    if not (np.isfinite(S1) and np.isfinite(Sb)):
        raise FloatingPointError(
            "unable to determine effective sample size")
    # Effective size shrinks by the variance inflation factor Sb/S1.
    self.neff = n * S1 / Sb
    if mics.verbose:
        info("Variance disregarding autocorrelation:", S1)
        info("Variance via Overlapping Batch Means:", Sb)
        info("Effective sample size:", self.neff)