Example #1
    def test_import_from_file(self, mock_open, mock_load, mock_isfile):
        """Test process-service file import"""

        # set return values
        mock_isfile.return_value = True
        mock_file = mock.MagicMock(name='service_file')
        mock_file.__enter__.return_value = mock_file
        mock_open.return_value = mock_file

        # create mock process-service class and instance
        logger = Logger()
        ps_cls = type('ps_cls', (), {'persist': True, 'logger': logger})
        ps = mock.Mock(name='ProcessService_instance')
        ps.__class__ = ps_cls

        # test normal import
        mock_load.return_value = ps
        ps_ = ProcessService.import_from_file.__func__(ps_cls, 'mock_file_path')
        self.assertIs(ps_, ps, 'unexpected process-service instance returned')
        mock_open.assert_called_once_with('mock_file_path', 'rb')
        mock_load.assert_called_once_with(mock_file)
        mock_open.reset_mock()
        mock_load.reset_mock()

        # test importing instance of incorrect type
        mock_load.return_value = None
        with self.assertRaises(TypeError):
            ProcessService.import_from_file.__func__(ps_cls, 'mock_file_path')
        mock_open.reset_mock()
        mock_load.reset_mock()

        # test import with non-persisting service
        ps_cls.persist = False
        ps_ = ProcessService.import_from_file.__func__(ps_cls, 'mock_file_path')
        self.assertIs(ps_, None, 'unexpected return value for non-persisting service')
Example #2
class TimePeriod(ArgumentsMixin):
    """Time period."""

    logger = Logger()

    def __init__(self, **kwargs):
        """Initialize TimePeriod instance."""
        pass

    def period_index(self, dt):
        """Get number of periods until date/time "dt".

        :param dt: specified date/time
        """
        self.logger.fatal(
            'period_index method not implemented for {cls}; please implement it in a derived class.',
            cls=self.__class__.__name__)
        raise NotImplementedError('period_index method is not implemented.')

    @classmethod
    def parse_time_period(cls, period):
        """Try to parse specified time period.

        :param period: specified period
        """
        # catch single value
        if not isinstance(period, dict):
            period = dict(value=period)

        # try to parse specified period
        try:
            return pd.Timedelta(**period).delta
        except Exception as ex:
            cls.logger.fatal('Unable to parse period: {period!s}.',
                             period=period)
            raise ex

    @classmethod
    def parse_date_time(cls, dt):
        """Try to parse specified date/time.

        :param dt: specified date/time
        """
        try:
            return pd.Timestamp(dt).value
        except Exception as ex:
            cls.logger.fatal('Unable to parse date/time: {dt!s}', dt=dt)
            raise ex
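A minimal usage sketch of the two parsers (not part of the original source; it assumes pandas is available as pd, as the class above implies, and that TimePeriod itself is in scope). Note that period_index must be overridden in a derived class.

# single values are wrapped as dict(value=period) before calling pd.Timedelta(**period)
ns_per_day = TimePeriod.parse_time_period('1 days')      # -> 86400000000000 nanoseconds
ns_per_day = TimePeriod.parse_time_period({'days': 1})   # keyword form of the same period
ts_ns = TimePeriod.parse_date_time('2017-01-01')         # nanoseconds since the epoch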
Example #3
from eskapade import ConfigObject, process_manager
from eskapade.logger import Logger
from eskapade.root_analysis import RooFitManager, TruncExpGen, TruncExpFit
from eskapade.root_analysis.roofit_models import TruncExponential

MODEL_NAME = 'voucher_redeem'
REDEEM_DATA_KEY = 'voucher_redeems'
AGE_DATA_KEY = 'voucher_ages'

MAX_AGE = 1500  # days
FAST_REDEEM_RATE = -0.01  # per day
SLOW_REDEEM_RATE = -0.001  # per day
FAST_FRAC = 0.4
REDEEM_FRAC = 0.6

logger = Logger()

logger.debug('Now parsing configuration file esk409_unredeemed_vouchers.')

###############################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk409_unredeemed_vouchers'
settings['version'] = 0

###############################################################################
# --- create voucher redeem model

# create model if it is not read from persisted services of first chain
if not settings.get('beginWithChain'):
Example #4
    with a default plotter link.

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops, visualization, root_analysis
from eskapade import process_manager
from eskapade.logger import Logger

logger = Logger('macro.esk404_workspace_createpdf_simulate_fit_plot')

logger.debug('Now parsing configuration file esk404_workspace_createpdf_simulate_fit_plot')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True
Example #5
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""

import datetime
import os
import shutil
import sys

from eskapade.logger import Logger
from eskapade import resources

logger = Logger(__name__)


def get_absolute_path(path):
    """Get an absolute path.

    First expands ~ if present, then resolves any . or .. components.

    :param path: path
    :returns: the absolute path
    """
    return os.path.abspath(os.path.expanduser(path))
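
# Illustrative example (not part of the original module); the result shown
# assumes a POSIX home directory of /home/user:
#
#     get_absolute_path('~/data/../input.csv')   # -> '/home/user/input.csv'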


def create_dir(path):
    """Create a leaf directory and all intermediate ones.
Example #6
class Histogram(BinningUtil, ArgumentsMixin):
    """Generic 1D Histogram class.

    Histogram holds bin labels (name of each bin), value_counts (values of
    the histogram) and a variable name.  The bins can be categorical or
    numeric, where numeric includes timestamps.  In case of numeric bins,
    bin_specs is set.  bin_specs is a dict containing bin_width and
    bin_offset.  In case bin widths are not equal, bin_specs contains
    bin_edges instead of bin_width and bin_offset.
    """

    logger = Logger()

    def __init__(self, counts, **kwargs):
        """Initialize Histogram instance.

        A bin_specs dictionary can be provided as input. bin_specs is a dict
        containing 'bin_width' and 'bin_offset' keys.  In case bin widths are
        not equal, bin_specs contains 'bin_edges' (array) instead of 'bin_width'
        and 'bin_offset'.  'bin_width' and 'bin_offset' can be numeric or numpy
        timestamps.

        Histogram counts can be specified as a ValueCounts object, a dictionary
        or a tuple:

        * tuple: Histogram((bin_values, bin_edges), variable=<your_variable_name>)
        * dict: a dictionary such as produced by pandas.Series.value_counts()
          or pandas.DataFrame.groupby(...).size() over one variable.
        * ValueCounts: a ValueCounts object contains a value_counts dictionary.

        Example bin_specs dictionaries are:

        >>> bin_specs = { 'bin_width': 1, 'bin_offset': 0 }
        >>> bin_specs = { 'bin_edges': [0,2,3,4,5,7,8] }
        >>> bin_specs = { 'bin_width': np.timedelta64(30,'D'),
                          'bin_offset': np.datetime64('2010-01-04') }

        :param counts: histogram counts
        :param dict bin_specs: dictionary contains 'bin_width' and 'bin_offset' numbers or 'bin_edges' array
                               (default is None)
        :param str variable: name of the variable represented by the histogram
        :param type datatype: data type of the variable represented by the histogram (optional)
        """
        # initialize Binning, pass bin_specs from kwargs
        BinningUtil.__init__(self, bin_specs=kwargs.pop('bin_specs', None))

        # parse arguments
        self._process_kwargs(kwargs, variable='', datatype=None)
        self.check_arg_vals('variable')
        self.check_arg_types(variable=str, bin_specs=dict)

        # check for extraneous keyword arguments
        self.check_extra_kwargs(kwargs)

        self.logger.debug('Initializing histogram for "{variable}".',
                          variable=self.variable)

        # set value counts
        if isinstance(counts, ValueCounts):
            # from ValueCounts object
            self._from_value_counts(counts)
        elif isinstance(counts, dict):
            # from value-counts dictionary
            self._from_dict(counts)
        else:
            # try from NumPy-style histogram
            try:
                counts, var_vals = list(counts[0]), list(counts[1])
            except Exception:
                self.logger.fatal(
                    'Invalid type for specified counts: "{type}".',
                    type=type(counts).__name__)
                raise RuntimeError('Invalid type for specified counts.')
            self._from_numpy(counts, var_vals)

        # check counts
        if self._val_counts.num_nonone_bins < 1:
            self.logger.warning('No bin counts specified for "{variable}".',
                                variable=self.variable)

        # check bin specifications
        if self.bin_specs:
            if 'bin_width' in self.bin_specs:
                try:
                    _check_num_vals(self.get_bin_labels())
                except TypeError as exc:
                    self.logger.fatal(
                        'Non-numeric values in bin labels not allowed if bin width is specified: {labels}.',
                        labels=self.get_bin_labels())
                    raise exc
            elif 'bin_edges' in self.bin_specs:
                edges = self.bin_specs['bin_edges']
                if len(edges) != self._val_counts.num_nonone_bins + 1:
                    self.logger.fatal(
                        'Number of specified bin edges ({n_edges:d}) does not match number of bins ({n_bins:d}).',
                        n_edges=len(edges),
                        n_bins=self._val_counts.num_nonone_bins)
                    raise RuntimeError(
                        'numbers of bins and bin edges do not match')
                try:
                    _check_num_vals(edges)
                except TypeError as exc:
                    self.logger.fatal(
                        'Non-numeric values found in bin edges: {edges}.',
                        edges=edges)
                    raise exc
                if any(e2 <= e1 for e1, e2 in zip(edges[:-1], edges[1:])):
                    self.logger.fatal(
                        'Values of bin edges are not increasing: {edges!s}',
                        edges=edges)
                    raise RuntimeError('Invalid bin edges specified.')

    @property
    def variable(self):
        """Name of variable represented by the histogram.

        :returns: variable name
        :rtype: string
        """
        return self._variable

    @variable.setter
    def variable(self, var):
        """Set name of variable represented by the histogram.

        :param str var: name of variable represented by the histogram.
        :raises RuntimeError: if the variable name has already been set, it will not be overwritten.
        """
        if hasattr(self, '_variable'):
            raise RuntimeError('histogram variable already set')
        self._variable = str(var)

    @property
    def datatype(self):  # noqa
        """Data type of the variable represented by the histogram.

        :returns: data type
        :rtype: type
        """
        return self._datatype

    @datatype.setter
    def datatype(self, dt):
        """Set data type of the variable represented by the histogram.

        :param type dt: type of the variable represented by the histogram
        :raises RuntimeError: if the data type has already been set, it will not be overwritten
        """
        if hasattr(self, '_datatype'):
            raise RuntimeError('datatype already set')
        self._datatype = dt

    @property
    def n_dim(self):  # noqa
        """Number of histogram dimensions.

        The number of histogram dimensions, which is equal to one by
        construction.

        :returns: number of dimensions
        :rtype: int
        """
        return 1

    @property
    def n_bins(self):  # noqa
        """Number of bins in the ValueCounts object.

        :returns: number of bins
        :rtype: int
        """
        return self._val_counts.num_bins

    @property
    def num_bins(self):  # noqa
        """Number of bins.

        :returns: number of bins
        :rtype: int
        """
        return self._val_counts.num_bins

    def get_bin_labels(self):
        """Return all bin labels.

        :returns: array of all bin labels
        :rtype: array
        """
        return [v[0] for v in self._val_counts.nononecounts.keys()]

    def bin_labels(self):
        """Return bin labels.

        :returns: array of all bin labels
        :rtype: array
        """
        return np.array(self.get_bin_labels())

    def get_uniform_bin_edges(self):
        """Return numpy style bin-edges array with uniform binning.

        :returns: array of all bin edges
        :rtype: array
        """
        if not self.bin_specs:
            return None
        bin_range = list(self.get_nonone_bin_range())
        return self.truncated_bin_edges(variable_range=bin_range)

    def get_nonone_bin_edges(self):
        """Return numpy style bin-edges array.

        :returns: array of the bin edges
        :rtype: array
        """
        if not self.bin_specs:
            return None
        if 'bin_edges' in self.bin_specs:
            bin_edges = self.bin_specs['bin_edges']
        else:
            width = self.bin_specs['bin_width']
            offset = self.bin_specs.get('bin_offset', 0)
            edges = [
                offset + v[0] * width
                for v in self._val_counts.nononecounts.keys()
            ]
            bin_edges = edges + [edges[-1] + width]
        return bin_edges

    def bin_edges(self):
        """Return numpy style bin_edges array with uniform binning.

        :returns: array of all bin edges
        :rtype: array
        """
        return np.array(self.get_uniform_bin_edges())

    def get_nonone_bin_centers(self):
        """Return bin centers.

        :returns: array of the bin centers
        :rtype: array
        """
        if not self.bin_specs or len(self.bin_specs) == 0:
            return self.get_bin_labels()
        if 'bin_edges' in self.bin_specs:
            bin_edges = self.bin_specs['bin_edges']
            # NOTE: computation below also works with timestamps! Order is
            # important.
            bin_centers = []
            for i in range(len(bin_edges) - 1):
                bin_width = bin_edges[i + 1] - bin_edges[i]
                bin_width_half = bin_width / 2.
                bin_center = bin_edges[i] + bin_width_half
                bin_centers.append(bin_center)
        else:
            width = self.bin_specs['bin_width']
            offset = self.bin_specs.get('bin_offset', 0)
            bin_centers = [
                offset + (v[0] + 0.5) * width
                for v in self._val_counts.nononecounts.keys()
            ]
        return bin_centers

    def bin_centers(self):
        """Return bin centers.

        :returns: array of the bin centers
        :rtype: array
        """
        return np.array(self.get_nonone_bin_centers())

    def get_bin_count(self, bin_label):
        """Get bin count for specific bin label.

        :param bin_label: a specific key to find corresponding bin.
        :returns: bin counter value
        :rtype: int
        """
        return self._val_counts.nononecounts.get((bin_label, ), 0)

    def get_nonone_bin_counts(self):
        """Return bin counts.

        :returns: array of the bin counts
        :rtype: array
        """
        if not self.bin_specs or len(self.bin_specs) == 0:
            bin_counts = self._val_counts.nononecounts.values()
        elif 'bin_edges' in self.bin_specs:
            bin_edges = self.bin_specs['bin_edges']
            # NOTE: computation below also works with timestamps! Order is
            # important.
            bin_centers = []
            for i in range(len(bin_edges) - 1):
                bin_width = bin_edges[i + 1] - bin_edges[i]
                bin_width_half = bin_width / 2.
                bin_center = bin_edges[i] + bin_width_half
                bin_centers.append(bin_center)
            bin_counts = [self.get_hist_val(bc) for bc in bin_centers]
        else:
            bin_counts = self._val_counts.nononecounts.values()
        return bin_counts

    def bin_entries(self):
        """Return number of bin entries.

        Return the bin counts of the known bins in the value_counts object.

        :returns: array of the bin counts
        :rtype: array
        """
        return np.array(self.get_nonone_bin_counts())

    def get_bin_range(self):
        """Return the bin range.

        :returns: tuple of the bin range found
        :rtype: tuple
        """
        return self.get_nonone_bin_range()

    def get_nonone_bin_range(self):
        """Return the bin range.

        :returns: tuple of the bin range found
        :rtype: tuple
        """
        if not self.bin_specs:
            return None
        if 'bin_edges' in self.bin_specs:
            edges = self.bin_specs['bin_edges']
            return edges[0], edges[-1]
        vals = [v[0] for v in self._val_counts.nononecounts]
        min_max = (vals[0], vals[-1])
        if self.bin_specs:
            width = self.bin_specs['bin_width']
            offset = self.bin_specs.get('bin_offset', 0.)
            return tuple(offset + m * width
                         for m in (min_max[0], min_max[1] + 1))
        return min_max

    def get_hist_val(self, var_value):
        """Get bin count for bin by value of histogram variable.

        :param var_value: a specific value to find corresponding bin.
        :returns: bin counter value
        :rtype: int
        """
        try:
            bin_label = self.value_to_bin_label(var_value)
        except Exception as exc:
            self.logger.error(
                'bin label for variable value "{value!s}" not found ({error})',
                value=var_value,
                error=exc)
            return 0
        return self.get_bin_count(bin_label)

    def get_bin_vals(self, variable_range=None, combine_values=True):
        """Get bin labels/edges and corresponding bin counts.

        Bin values corresponding to a given variable range.

        :param list variable_range: variable range used for finding the right bins to get values from. Optional.
        :param bool combine_values: if bin_specs is not set, combine existing bin labels with variable range.
        :returns: two arrays of bin values and bin edges
        :rtype: array
        """
        # check type of input arguments
        if variable_range is not None:
            variable_range = tuple(variable_range)
        combine_values = bool(combine_values)

        # create NumPy arrays of bin labels/edges
        if variable_range and not self.bin_specs:
            labs = bins = np.unique(np.append(self.get_bin_labels(), variable_range)) if combine_values else \
                np.array(variable_range)
        else:
            labs = np.array(self.get_bin_labels())
            bins = labs if not self.bin_specs else np.array(
                self.get_nonone_bin_edges())

        if variable_range and self.bin_specs:
            bins = self.truncated_bin_edges(
                variable_range=list(variable_range))
            # NOTE: computation below also works with timestamps! Order is
            # important.
            bin_centers = []
            for i in range(len(bins) - 1):
                bin_width = bins[i + 1] - bins[i]
                bin_width_half = bin_width / 2.
                bin_center = bins[i] + bin_width_half
                bin_centers.append(bin_center)
            labs = [self.value_to_bin_label(bc) for bc in bin_centers]

        return np.array([self.get_bin_count(v)
                         for v in labs]), np.asarray(bins)

    def remove_keys_of_inconsistent_type(self, prefered_key_type=None):
        """Remove all keys that have inconsistent data type(s).

        :param tuple prefered_key_type: the preferred key type to keep. Can be a tuple, list, or single type, e.g. str
                                        or (int, str, float). If None is provided, the most common key type found is kept.
        """
        n_keys_prev = len(self._val_counts._cnts)
        self._val_counts.remove_keys_of_inconsistent_type(prefered_key_type)
        n_keys_new = len(self._val_counts._cnts)

        if n_keys_new < n_keys_prev:
            self.logger.info(
                'Removed "{n:d}" inconsistent keys out of "{n_total:d}", requiring "{type}" data type.',
                n=n_keys_prev - n_keys_new,
                n_total=n_keys_prev,
                type=prefered_key_type
                if prefered_key_type is not None else 'most common')

    def simulate(self, size, *args):
        """Simulate data using self (Histogram instance) as PDF.

        see https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.html

        :param int size: number of data points to generate
        :returns: the generated data and a Histogram of the generated data
        :rtype: tuple (numpy.array, Histogram)
        """
        h_norm = self.to_normalized()
        hist, bins = h_norm.get_bin_vals(*args)
        values = np.random.rand(size)

        # check if Histogram is categorical or numeric
        if not self.bin_specs:
            generated_data = np.random.choice(bins, p=hist, size=size)
            vc = pd.Series(generated_data).value_counts() / float(
                len(generated_data))
            h_sim = Histogram((vc.values, vc.index.values), variable='x')
        else:
            # see
            # http://stackoverflow.com/questions/17821458/random-number-from-histogram
            bin_midpoints = bins[:-1] + np.diff(bins) / 2
            cdf = np.cumsum(hist)
            generated_data = np.searchsorted(cdf, values)
            random_from_cdf = bin_midpoints[generated_data]
            h_sim = Histogram(np.histogram(random_from_cdf, bins=bins),
                              variable=self.variable)
        return generated_data, h_sim

    def surface(self):
        """Calculate surface of the histogram.

        :returns: surface
        """
        if not self.bin_specs:
            raise NotImplementedError(
                'Surface for categorical histograms is not implemented.')
        hist, bin_edges = self.get_bin_vals()
        bin_widths = np.diff(bin_edges)
        return np.multiply(hist, bin_widths).sum()

    def copy(self, **kwargs):
        """Return a copy of this histogram.

        :param str variable: assign new variable name
        """
        new_var_name = str(kwargs.pop('variable', self.variable))
        return Histogram(counts=self.get_bin_vals(**kwargs),
                         variable=new_var_name)

    def to_normalized(self, **kwargs):
        """Return a normalized copy of this histogram.

        :param str new_var_name: assign new variable name
        :param list variable_range: variable range used for finding the right bins to get values from.
        :param bool combine_values: if bin_specs is not set, combine existing bin labels with variable range.
        """
        # convert to normalized histogram
        new_var_name = str(kwargs.pop('variable', self.variable))
        bin_vals = self.get_bin_vals(**kwargs)
        values = np.float64(bin_vals[0]) / bin_vals[0].sum()
        # When values is a numpy array of 1 element np.float64() returns a 0-dimensional array. See
        # https://github.com/numpy/numpy/issues/3161. The following
        # if-statement is a workaround for this issue.
        if not values.shape:
            values = values.reshape((1, ))
        return Histogram(counts=(values, bin_vals[1]), variable=new_var_name)

    @classmethod
    def combine_hists(cls,
                      hists,
                      labels=False,
                      rel_bin_width_tol=1e-6,
                      **kwargs):
        """Combine a set of histograms.

        :param array hists: array of Histograms to add up.
        :param bool labels: whether the histograms to add up have labels (else they are numeric); default is False
        :param str variable: name of variable described by the summed-up histogram
        :param float rel_bin_width_tol: relative tolerance between numeric bin edges.
        :returns: summed up histogram
        :rtype: Histogram
        """
        # parse input arguments
        new_var_name = str(kwargs.pop('variable', 'x'))
        cls.logger.debug(
            'Combining histograms into one new histogram for "{name}".',
            name=new_var_name)

        try:
            bin_vals = [h.get_bin_vals(**kwargs) for h in hists]
        except Exception:
            cls.logger.fatal(
                'Unable to get bin values from specified histograms ({hists!s}).',
                hists=hists)
            raise RuntimeError('Invalid input histograms specified.')
        if not bin_vals:
            raise RuntimeError('No input histograms specified.')

        # check histogram bins
        ref_bins = bin_vals[0][1]
        if labels:
            if not all(np.array_equal(ref_bins, h[1]) for h in bin_vals):
                cls.logger.fatal(
                    'Histograms with different binnings specified: {hists}.',
                    hists=', '.join(str(h[1]) for h in hists))
                raise RuntimeError(
                    'histograms with different binnings specified')
        else:
            ref_bin_width = hists[0].bin_specs['bin_width']
            atol = rel_bin_width_tol * ref_bin_width
            if not all(
                    np.allclose(ref_bins, h[1], rtol=0, atol=atol)
                    for h in bin_vals):
                cls.logger.fatal(
                    'Histograms with different binnings specified: {hists}.',
                    hists=', '.join(str(h[1]) for h in hists))
                raise RuntimeError(
                    'histograms with different binnings specified')

        # return histogram with sum of bin counts
        return cls(counts=(np.sum([h[0] for h in bin_vals], axis=0), ref_bins),
                   variable=new_var_name)

    def _from_value_counts(self, counts):
        """Create Histogram from ValueCounts instance.

        :param ValueCounts counts: value counts
        """
        # initialize value counts from ValueCounts object
        if self.variable not in counts.key:
            self.logger.fatal(
                'Variable "{var}" not in value counts with variables ({vars})',
                var=self.variable,
                vars=', '.join('"{}"'.format(v) for v in counts.key))
            raise RuntimeError('specified variable and counts do not match')
        self._val_counts = counts.create_sub_counts((self.variable, ))

    def _from_dict(self, counts):
        """Create Histogram from dictionary.

        :param dict counts: value counts
        """
        # initialize value counts from dictionary
        counts = dict((k if isinstance(k, tuple) else (k, ), v)
                      for k, v in counts.items())
        if not all(len(k) == 1 for k in counts):
            self.logger.fatal(
                'Specified counts dictionary contains keys with multiple variable values.'
            )
            raise AssertionError('Invalid format for counts keys.')
        _check_num_vals(iter(counts.values()))
        self._val_counts = ValueCounts((self.variable, ), (self.variable, ),
                                       counts)

    def _from_numpy(self, counts, bin_edges):
        """Create Histogram from NumPy-style histogram.

        :param array counts: numpy histogram counts array
        :param array bin_edges: bin edges
        """
        # initialize from NumPy-style histogram
        _check_num_vals(counts)
        if len(counts) == len(bin_edges) - 1:
            # interpret specified variable values as bin edges
            del self._bin_specs
            self.bin_specs = {'bin_edges': list(bin_edges)}
            bin_edges = list(range(len(counts)))
        elif len(counts) != len(bin_edges):
            # cannot interpret specified variable values as bin values
            self.logger.fatal(
                'Numbers of specified variable values ({n:d}) and value counts ({counts:d}) do not match',
                n=len(bin_edges),
                counts=len(counts))
            raise AssertionError(
                'specified variable values and value counts do not match')
        self._val_counts = ValueCounts(
            (self.variable, ), (self.variable, ),
            dict(((v, ), c) for c, v in zip(counts, bin_edges)))
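A short construction-and-use sketch for Histogram (illustrative, not from the original source; it assumes numpy is available as np and that the module's own helpers, such as ValueCounts, are in scope as above).

# build a numeric histogram from a NumPy-style (counts, bin_edges) tuple
counts, edges = np.histogram([1, 1, 2, 3, 3, 3, 4], bins=4)
hist = Histogram((counts, edges), variable='x')   # stores bin_specs['bin_edges']

hist.n_bins                       # number of bins
hist.bin_entries()                # counts per bin
hist.bin_centers()                # centers derived from the bin edges
hist.surface()                    # sum of counts times bin widths
h_norm = hist.to_normalized()     # counts scaled to unit sum
data, h_sim = hist.simulate(100)  # sample from the histogram used as a PDF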
Example #7
class Processor(metaclass=ABCMeta):
    """Processor metaclass."""

    logger = Logger()  # type: Logger

    def __init__(self, name: str):
        """Initialize the Processor object."""
        super().__init__()
        name = name or self.__class__.__name__
        self.__name = name  # type: str
        self.__hash = None  # type: int
        self.__parent = None

    def __str__(self) -> str:
        return self.__name

    def __repr__(self) -> str:
        return '<{klass!s} name={name!s} parent={parent!r} id={id!s}>'.format(
            klass=self.__class__.__name__,
            name=self.name,
            parent=self.__parent,
            id=id(self))

    def __eq__(self, other: 'Processor') -> bool:
        return isinstance(other, type(self)) and self.__name == other.__name

    def __hash__(self) -> int:
        if self.__hash is None:
            self.__hash = hash((type(self), self.__name))

        return self.__hash

    def _initialize(self):
        """Wrapper to call user implemented initialize."""
        self.logger.debug('Initializing link "{link!s}".', link=self)

        status = self.initialize()

        if status == StatusCode.Success:
            self.logger.debug('Successfully initialized link "{link!s}".',
                              link=self)

        return status

    @abstractmethod
    def initialize(self):
        """Initialization logic for processor."""
        raise NotImplementedError

    def _execute(self):
        """Wrapper to call user implemented execute."""
        self.logger.debug('Executing link "{link!s}".', link=self)

        status = self.execute()

        if status == StatusCode.Success:
            self.logger.debug('Successfully executed link "{link!s}".',
                              link=self)

        return status

    @abstractmethod
    def execute(self):
        """Execution logic for processor."""
        raise NotImplementedError

    def _finalize(self):
        """Wrapper to call user implemented finalize."""
        self.logger.debug('Finalizing link "{link!s}".', link=self)

        status = self.finalize()

        if status == StatusCode.Success:
            self.logger.debug('Successfully finalized link "{link!s}".',
                              link=self)

        return status

    @abstractmethod
    def finalize(self):
        """Finalization logic for processor."""
        raise NotImplementedError

    @property
    def name(self) -> str:
        """Get the name of processor.

        :return: The name of the processor.
        :rtype: str
        """
        return self.__name

    @property
    def parent(self):
        """Get the group parent.

        :return: The parent/group processor sequence.
        """
        return self.__parent

    @parent.setter
    def parent(self, the_parent) -> None:
        """Set the group parent.

        :param the_parent: The parent/group processor sequence.
        """
        self.__parent = None

        if the_parent is not None:
            # The parent will most likely outlive the processor
            # and therefore we do not want to keep a strong reference
            # to the parent.
            self.__parent = proxy(the_parent)
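A minimal concrete link, sketched for illustration (not from the original source; it assumes StatusCode is importable from eskapade, as in the framework's other links).

class HelloLink(Processor):
    """Toy link that only logs a message."""

    def initialize(self):
        return StatusCode.Success

    def execute(self):
        self.logger.info('Hello from link "{link!s}".', link=self)
        return StatusCode.Success

    def finalize(self):
        return StatusCode.Success

# the process manager normally drives the wrappers, e.g. link._initialize()
link = HelloLink('hello')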
Example #8
class SparkAnalysisTutorialMacrosTest(TutorialMacrosTest):
    """Integration tests based on spark-analysis tutorial macros."""
    logger = Logger()

    def setUp(self):
        """Set up test."""
        TutorialMacrosTest.setUp(self)
        settings = process_manager.service(ConfigObject)
        settings['analysisName'] = 'SparkAnalysisTutorialMacrosTest'

        # ensure local testing
        spark_settings = [('spark.app.name', settings['analysisName']),
                          ('spark.master', 'local[*]'),
                          ('spark.driver.host', 'localhost')]
        process_manager.service(SparkManager).create_session(
            eskapade_settings=settings, spark_settings=spark_settings)

    def tearDown(self):
        """Tear down test environment."""
        process_manager.service(SparkManager).finish()
        TutorialMacrosTest.tearDown(self)

    def test_esk601(self):
        """Test Esk-601: Configure Spark."""
        # ensure no running Spark instance
        process_manager.service(SparkManager).finish()

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk601_spark_configuration.py'))

        sc = process_manager.service(SparkManager).get_session().sparkContext

        # check configuration
        self.assertEqual(sc.getConf().get('spark.app.name'),
                         'esk601_spark_configuration_link',
                         'SparkConf.setAll() not picked up correctly')
        self.assertEqual(sc.getConf().get('spark.master'), 'local[42]',
                         'SparkConf.setAll() not picked up correctly')
        self.assertEqual(sc.getConf().get('spark.driver.host'), '127.0.0.1',
                         'SparkConf.setAll() not picked up correctly')

        # stop spark manager
        process_manager.service(SparkManager).finish()

    def test_esk602(self):
        """Test Esk-602: Read CSV files into a Spark data frame."""
        # check if running in local mode
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk602_read_csv_to_spark_df.py'))
        ds = process_manager.service(DataStore)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].rdd.getNumPartitions(), 5,
                         'unexpected number of partitions in data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(ds['spark_df'].columns, ['date', 'loc', 'x', 'y'],
                             'unexpected columns in data frame')
        self.assertSetEqual(
            set((r['date'], r['loc']) for r in ds['spark_df'].collect()),
            set([(20090101, 'a'), (20090102, 'b'), (20090103, 'c'),
                 (20090104, 'd'), (20090104, 'e'), (20090106, 'a'),
                 (20090107, 'b'), (20090107, 'c'), (20090107, 'd'),
                 (20090108, 'e'), (20090109, 'e'), (20090109, 'f')]),
            'unexpected values in date/loc columns')

    def test_esk603(self):
        """Test Esk-603: Write Spark data to CSV."""
        # check if running in local mode
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.eskapade_run(
            resources.tutorial('esk603_write_spark_data_to_csv.py'))

        # read output data
        results_data_path = persistence.io_dir('results_data')
        names = []
        headers = []
        contents = []
        csv_dirs = glob.glob('{}/*'.format(results_data_path))
        self.assertEqual(len(csv_dirs), 3,
                         'expected to find three CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            self.assertEqual(
                len(csv_files), 1,
                'expected to find only one CSV file in "{}"'.format(names[-1]))
            with open(csv_files[0]) as csv:
                contents.append([l.strip().split(',') for l in csv])
                self.assertEqual(
                    len(contents[-1]), 101,
                    'unexpected number of lines in "{}" CSV'.format(names[-1]))
                headers.append(contents[-1][0])
                contents[-1] = sorted(contents[-1][1:])

        # check output data
        self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                             'unexpected CSV header for "{}"'.format(names[0]))
        self.assertListEqual(
            contents[0],
            sorted([str(it), 'foo{:d}'.format(it),
                    str((it + 1) / 2.)] for it in range(100)),
            'unexpected CSV content for "{}"'.format(names[0]))
        for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
            self.assertListEqual(
                head, headers[0],
                'CSV header of "{0:s}" differs from header of "{1:s}"'.format(
                    name, names[0]))
            self.assertListEqual(
                cont, contents[0],
                'CSV content of "{0:s}" differs from content of "{1:s}"'.
                format(name, names[0]))

    def test_esk604(self):
        """Test Esk-604: Execute Spark-SQL query."""
        # check if running in local mode
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk604_spark_execute_query.py'))
        ds = process_manager.service(DataStore)

        # check data frame
        self.assertIn('spark_df_sql', ds,
                      'no object with key "spark_df_sql" in data store')
        self.assertIsInstance(ds['spark_df_sql'], pyspark.sql.DataFrame,
                              '"spark_df_sql" is not a Spark data frame')
        self.assertEqual(ds['spark_df_sql'].count(), 4,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(ds['spark_df_sql'].columns,
                             ['loc', 'sumx', 'sumy'],
                             'unexpected columns in data frame')
        self.assertEqual(
            ds['spark_df_sql'].schema,
            process_manager.get('ApplySQL').get('SparkSQL').schema,
            'schema of data frame does not correspond to schema stored in link'
        )
        self.assertSetEqual(
            set(tuple(r) for r in ds['spark_df_sql'].collect()),
            set([('e', 10, 15), ('d', 2, 11), ('b', 6, 16), ('a', 2, 18)]),
            'unexpected values in loc/sumx/sumy columns')

    def test_esk605(self):
        """Test Esk-605: Create Spark data frame."""
        # run Eskapade
        self.eskapade_run(resources.tutorial('esk605_create_spark_df.py'))
        ds = process_manager.service(DataStore)

        # check created data frames
        cols = (StructField('index',
                            LongType()), StructField('foo', StringType()),
                StructField('bar', DoubleType()))
        rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.)
                for it in range(20, 100)]
        for key in ('rows_df', 'rdd_df', 'df_df', 'pd_df'):
            self.assertIn(key, ds,
                          'no object with key {} in data store'.format(key))
            df = ds[key]
            self.assertIsInstance(
                df, pyspark.sql.DataFrame,
                'object with key {0:s} is not a data frame (type {1!s})'.
                format(key, type(df)))
            self.assertTupleEqual(
                tuple(df.schema), cols,
                'unexpected data-frame schema for {}'.format(key))
            self.assertListEqual(
                sorted(tuple(r) for r in df.collect()), rows,
                'unexpected data-frame content for {}'.format(key))
            self.assertTrue(df.is_cached,
                            'data frame {} is not cached'.format(key))
            self.assertLessEqual(
                df.rdd.getNumPartitions(), 2,
                'unexpected number of data-frame partitions for {}'.format(
                    key))

    def test_esk606(self):
        """Test Esk-606: Convert Spark data frame."""
        # run Eskapade
        self.eskapade_run(resources.tutorial('esk606_convert_spark_df.py'))
        ds = process_manager.service(DataStore)

        # define types of stored data sets
        data_types = {
            'df': pyspark.sql.DataFrame,
            'rdd': pyspark.RDD,
            'list': list,
            'pd': pd.DataFrame
        }

        # define functions to obtain data-frame content
        content_funcs = {
            'df': lambda d: sorted(d.rdd.map(tuple).collect()),
            'rdd': lambda d: sorted(d.collect()),
            'list': lambda d: sorted(d),
            'pd': lambda d: sorted(map(tuple, d.values))
        }

        # check input data
        self.assertIn('df', ds, 'no data found with key "df"')
        self.assertIsInstance(ds['df'], pyspark.sql.DataFrame,
                              'unexpected type for input data frame')

        # check created data sets
        rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.)
                for it in range(20, 100)]
        for key, dtype in data_types.items():
            # check content
            dkey = '{}_output'.format(key)
            self.assertIn(dkey, ds, 'no data found with key "{}"'.format(dkey))
            self.assertIsInstance(ds[dkey], dtype,
                                  'unexpected type for "{}" data'.format(key))
            self.assertListEqual(
                content_funcs[key](ds[dkey]), rows,
                'unexpected content for "{}" data'.format(key))

            # check schema
            skey = '{}_schema'.format(key)
            self.assertIn(skey, ds, 'no schema found with key {}'.format(skey))
            self.assertListEqual(list(ds[skey]), list(ds['df'].schema),
                                 'unexpected schema for "{}" data'.format(key))

    def test_esk607(self):
        """Test Esk-607: Add column to Spark dataframe."""
        # check if running in local mode
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk607_spark_with_column.py'))
        ds = process_manager.service(DataStore)

        # check data frame
        self.assertIn('new_spark_df', ds,
                      'no object with key "new_spark_df" in data store')
        self.assertIsInstance(ds['new_spark_df'], pyspark.sql.DataFrame,
                              '"new_spark_df" is not a Spark data frame')
        self.assertEqual(ds['new_spark_df'].count(), 5,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(
            ds['new_spark_df'].columns,
            ['dummy', 'date', 'loc', 'x', 'y', 'pow_xy1', 'pow_xy2'],
            'unexpected columns in data frame')
        self.assertSetEqual(
            set(tuple(r) for r in ds['new_spark_df'].collect()),
            set([('bla', 20090103, 'c', 5, 7, 78125.0, 78125.0),
                 ('bal', 20090102, 'b', 3, 8, 6561.0, 6561.0),
                 ('flo', 20090104, 'e', 3, 5, 243.0, 243.0),
                 ('bar', 20090101, 'a', 1, 9, 1.0, 1.0),
                 ('foo', 20090104, 'd', 1, 6, 1.0, 1.0)]),
            'unexpected values in columns')

    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro."""
        # check if required Python and Java libraries are made available to worker nodes
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegex(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk608_spark_histogrammar.py'))
        ds = process_manager.service(DataStore)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'
                      ] + ['hist_{}.pdf'.format(col) for col in f_bases]
        for fname in file_names:
            path = persistence.io_path('results_data',
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertTrue(statinfo.st_size > 0)

    def test_esk609(self):
        """Test Esk-609: Map data-frame groups."""
        # run Eskapade
        self.eskapade_run(resources.tutorial('esk609_map_df_groups.py'))
        ds = process_manager.service(DataStore)

        # check input data
        for key in ('map_rdd', 'flat_map_rdd'):
            self.assertIn(key, ds, 'no data found with key "{}"'.format(key))
            self.assertIsInstance(
                ds[key], pyspark.RDD,
                'object "{0:s}" is not an RDD (type "{1!s}")'.format(
                    key, type(ds[key])))

        # sums of "bar" variable
        bar_sums = [(0, 27.5), (1, 77.5), (2, 127.5), (3, 177.5), (4, 227.5),
                    (5, 277.5), (6, 327.5), (7, 377.5), (8, 427.5), (9, 477.5)]
        flmap_rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.,
                       bar_sums[it // 10][1]) for it in range(100)]

        # check mapped data frames
        self.assertListEqual(sorted(ds['map_rdd'].collect()), bar_sums,
                             'unexpected values in "map_rdd"')
        self.assertListEqual(sorted(ds['flat_map_rdd'].collect()), flmap_rows,
                             'unexpected values in "flat_map_rdd"')

    def test_esk610(self):
        """Test Esk-610: Spark Streaming word count."""
        # this test relies on linux shell scripts to create file stream
        if (sys.platform != 'linux') and (sys.platform != 'darwin'):
            self.logger.debug(
                'skipping test_esk610 for non-unix {} platform'.format(
                    sys.platform))
            return

        # check if running in local mode
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # create test dir
        tmpdir = '/tmp/eskapade_stream_test'
        os.mkdir(tmpdir)

        def remove_tmp():
            # clean up files
            shutil.rmtree(tmpdir)

        self.addCleanup(remove_tmp)

        # create a file stream
        tmpfile = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(8))
        cmd = 'for i in $(seq -f \"%05g\" 0 1000); \
                do echo \'Hello world\' > "{}"/"{}"_$i.dummy; \
                        sleep 1; done'.format(tmpdir, tmpfile)
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # run eskapade
        self.eskapade_run(
            resources.tutorial('esk610_spark_streaming_wordcount.py'))
        ds = process_manager.service(DataStore)

        # end file stream
        p.kill()

        # check if file stream was properly executed
        stdout, stderr = p.communicate()
        self.assertEqual(stdout, b'',
                         'unexpected stdout output {}'.format(stdout))
        self.assertEqual(stderr, b'',
                         'unexpected stderr output {}'.format(stderr))

        # check if stream was setup correctly (that's all we can do - the data itself is gone)
        self.assertIsInstance(ds['dstream'], pyspark.streaming.DStream)

        # read and check output data
        results_data_path = persistence.io_dir('results_data')
        names = []
        contents = []
        csv_dirs = glob.glob(
            '{}/dstream/wordcount-*.txt'.format(results_data_path))
        self.assertGreater(len(csv_dirs), 0,
                           'expected to find CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            # self.assertEqual(len(csv_files), 1, 'expected to find exactly one CSV file in "{}"'.format(names[-1]))
            if csv_files:
                with open(csv_files[0]) as csv:
                    record = [l for l in csv]
                    if record:  # empty records are allowed (because of timing differences)
                        self.assertRegex(
                            record[0], 'Hello',
                            'Expected \'Hello\' as in \'Hello world\'')
                        self.assertRegex(
                            record[1], 'world',
                            'Expected \'world\' as in \'Hello world\'')
                    contents.append(record[:])
        self.assertGreater(
            len(contents), 0,
            'expected ~ten items (each second a streaming RDD) - depending on timing'
        )
Example #9
class ProcessService(metaclass=ProcessServiceMeta):
    """Base class for process services."""

    logger = Logger()
    _persist = False

    def __init__(self):
        """Initialize service instance."""
        pass

    def __str__(self):
        """Get printable specification of service instance."""
        return '{0!s} ({1:s})'.format(type(self), hex(id(self)))

    @classmethod
    def create(cls):
        """Create an instance of this service.

        :returns: service instance
        :rtype: ProcessService
        """
        # create instance and make sure the service is initialized
        inst = cls()
        ProcessService.__init__(inst)
        return inst

    def finish(self):
        """Finish current processes.

        This function can be implemented by a process-service implementation to
        finish running processes and clean up to prepare for a reset of the
        process manager.  This would typically involve deleting large objects
        and closing files and database connections.
        """
        pass

    @classmethod
    def import_from_file(cls, file_path):
        """Import service instance from a Pickle file.

        :param str file_path: path of Pickle file
        :returns: imported service instance
        :rtype: ProcessService
        :raises: RuntimeError, TypeError
        """
        # check if service can be persisted
        if cls.persist:
            cls.logger.debug('Importing service instance of "{cls!s}" from file "{path}".', cls=cls, path=file_path)
        else:
            cls.logger.debug('Not importing service "{cls!s}".', cls=cls)
            return None

        # check specified file path
        if not os.path.isfile(file_path):
            cls.logger.fatal('Specified path for importing "{cls!s}" instance is not a file "{path}".',
                             cls=cls, path=file_path)
            raise RuntimeError('Invalid file path specified for importing process service.')

        try:
            # try to open file and import instance
            with open(file_path, 'rb') as inst_file:
                inst = pickle.load(inst_file)
        except Exception as exc:
            # re-raise exception if import failed
            cls.logger.warning('Failed to import service instance of "{cls!s}" from file "{path}".',
                               cls=cls, path=file_path)
            raise exc

        # check type of instance
        if not isinstance(inst, cls):
            cls.logger.fatal('Expected to import "{cls!s}" instance, got object of type "{type}".',
                             cls=cls, type=type(inst).__name__)
            raise TypeError('Incorrect type for imported service object.')

        return inst

    def persist_in_file(self, file_path):
        """Persist service instance in Pickle file.

        :param str file_path: path of Pickle file
        """
        # check if service can be persisted
        if type(self).persist:
            self.logger.debug('Persisting service instance "{instance!s}" in file "{path}".',
                              instance=self, path=file_path)
        else:
            self.logger.debug('Not persisting service "{type!s}".', type=type(self))
            return

        try:
            # try to persist
            with open(file_path, 'wb') as inst_file:
                pickle.dump(self, inst_file)
        except Exception as exc:
            # give warning if persisting failed
            self.logger.warning('Failed to persist service instance "{instance!s}" in file "{path}".',
                                instance=self, path=file_path)
            self.logger.warning('Caught exception "{exc!s}".', exc=exc)
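A usage sketch for persisting and re-importing a service (illustrative, not from the original source; it assumes that ProcessServiceMeta exposes the class-level _persist flag as the persist property used above).

class MyCacheService(ProcessService):
    """Toy service that is persisted between runs."""

    _persist = True  # assumption: ProcessServiceMeta maps this onto the 'persist' property

    def __init__(self):
        super().__init__()
        self.cache = {}

svc = MyCacheService.create()
svc.cache['answer'] = 42
svc.persist_in_file('/tmp/my_cache_service.pkl')
restored = MyCacheService.import_from_file('/tmp/my_cache_service.pkl')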
Example #10
class ArrayStats:
    """Create summary of an array.

    Class to calculate statistics (mean, standard deviation, percentiles,
    etc.) and create a histogram of values in an array.
    The statistics can be returned as values in a dictionary, a
    printable string, or as a LaTeX string.
    """

    logger = Logger()

    def __init__(self, data, col_name, weights=None, unit='', label=''):
        """Initialize for a single column in data frame.

        :param data: input data
        :type data: iterable, pandas.DataFrame, or (keys of a) dict
        :param col_name: column name
        :param weights: weights array or name of the weights column in data (default None)
        :type weights: iterable or str
        :param unit: Unit of column
        :param str label: Label to describe column variable
        :raises: TypeError
        """
        # set initial values of attributes
        self.stat_vars = []
        self.stat_vals = {}
        self.print_lines = []
        self.latex_table = []

        # parse arguments
        self.name = str(col_name)
        self.unit = str(unit)
        self.label = str(label)
        self.col = data[self.name] if isinstance(data, pd.DataFrame) else data
        self.weights = data[weights] if isinstance(weights, str) else weights
        if isinstance(data, dict):
            self.col = sorted(data.keys())
            self.weights = [data[k] for k in self.col]

        # check if column is iterable
        try:
            iter(self.col)
        except TypeError:
            raise TypeError('Specified data object is not iterable.')
        if self.weights is not None:
            try:
                iter(self.weights)
            except TypeError:
                raise TypeError('Specified weights object is not iterable.')

        # check sizes of data and weights
        if self.weights is not None and len(self.col) != len(self.weights):
            raise AssertionError(
                'weights and data do not have the same length.')

        # store data and weights in a Pandas Series
        if not isinstance(self.col, pd.Series):
            self.col = pd.Series(val for val in self.col)
        if self.weights is not None:
            if not isinstance(self.weights, pd.Series):
                self.weights = pd.Series(w for w in self.weights)

        # store non-null column values
        self.col_nn = self.col[self.col.notnull()]
        self.weights_nn = self.weights[
            self.col.notnull()] if self.weights is not None else None

        # to be filled in make_histogram
        self.hist = None

    def get_col_props(self):
        """Get column properties.

        :returns dict: Column properties
        """
        return get_col_props(self.col.dtype)

    def create_mpv_stat(self):
        """Compute most probable value from histogram.

        This function computes the most probable value based on the histogram
        from make_histogram(), and adds it to the statistics.
        """
        # basic checks
        if self.hist is None:
            self.logger.warning(
                'Internal histogram is not filled. Run make_histogram() first.'
            )
            return
        if len(self.hist) != 2:
            raise AssertionError(
                'Internal histogram needs to consist of two arrays.')
        values, bins = self.hist
        if not isinstance(values, np.ndarray) and not isinstance(values, list):
            raise TypeError('Values should be a list or numpy array.')
        if not isinstance(bins, np.ndarray) and not isinstance(bins, list):
            raise TypeError('Bins should be a list or numpy array.')
        if len(bins) != len(values) and len(bins) != len(values) + 1:
            raise AssertionError('Bins and values have inconsistent lengths.')

        # if two max elements are equal, this will return the element with the lowest index.
        max_idx = max(enumerate(values), key=lambda x: x[1])[0]
        bc = bins[max_idx]

        # determine column properties
        col_props = self.get_col_props()

        if col_props['is_num'] and len(bins) == len(values) + 1:
            # shift to bin center. note: this also works for timestamps.
            bc += (bins[max_idx + 1] - bc) / 2

        # append statistics
        mpv_name = 'mpv'
        self.stat_vars.append(mpv_name)
        self.stat_vals[mpv_name] = (bc, '{!s}'.format(bc))
        name_len = max(len(n) for n in self.stat_vars)
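        # pad the statistic name to the width of the longest name so the printed values line up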
        self.print_lines.append(
            '{{0:{:d}s}} : {{1:s}}'.format(name_len).format(
                mpv_name, self.stat_vals[mpv_name][1]))

    def create_stats(self):
        """Compute statistical properties of column variable.

        This function computes the statistical properties of values in the
        specified column.  It is called by other functions that use the
        resulting figures to create a statistical overview.
        """
        # reset stats containers
        self.stat_vars = []
        self.stat_vals = {}
        self.print_lines = []
        self.latex_table = []

        # determine column properties
        col_props = self.get_col_props()

        # get value counts
        cnt, var_cnt, dist_cnt = (len(self.col), len(self.col_nn),
                                  self.col.nunique())
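        # with weights, report total and filled counts as (integer-truncated) sums of the weights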
        if self.weights_nn is not None:
            cnt, var_cnt = int(sum(self.weights)), int(sum(self.weights_nn))
        for stat_var, stat_val in zip(('count', 'filled', 'distinct'),
                                      (cnt, var_cnt, dist_cnt)):
            self.stat_vars.append(stat_var)
            self.stat_vals[stat_var] = (stat_val, '{:d}'.format(stat_val))
        n_nan = self.col.isnull().sum()
        if n_nan:
            self.stat_vars.append('nan')
            self.stat_vals['nan'] = (n_nan, '{:d}'.format(n_nan))
        # add value counts to print lines
        self.print_lines.append(
            '{}:'.format(self.label if self.label else self.name))
        ratio = (var_cnt / cnt) * 100 if cnt != 0 else 0
        self.print_lines.append('{0:d} entries ({1:.0f}%)'.format(
            var_cnt, ratio))
        self.print_lines.append('{0:d} unique entries'.format(dist_cnt))

        # convert time stamps to integers
        if col_props['is_ts']:
            col_num = self.col_nn.astype(int)
        else:
            col_num = self.col_nn

        # get additional statistics for numeric variables
        if col_props['is_num'] and len(col_num):
            stat_vars = ('mean', 'std', 'min', 'max', 'p01', 'p05', 'p16',
                         'p50', 'p84', 'p95', 'p99')
            quant_probs = (0, 1, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)
            # stat_vals = (col_num.mean(), col_num.std(), col_num.min(), col_num.max())\
            #            + tuple(col_num.quantile((0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)))
            # two lines below also work if weights are None
            des = DescrStatsW(col_num, self.weights_nn)
            stat_vals = (des.mean, des.std) + tuple(
                weighted_quantile(col_num, self.weights_nn, quant_probs))
            self.stat_vars += stat_vars
            for stat_var, stat_val in zip(stat_vars, stat_vals):
                if not col_props['is_ts']:
                    # value entry for floats and integers
                    self.stat_vals[stat_var] = (stat_val,
                                                '{:+g}'.format(stat_val))
                else:
                    if stat_var != 'std':
                        # display time stamps as date/time strings
                        self.stat_vals[stat_var] = (pd.Timestamp(
                            int(stat_val)), str(pd.Timestamp(int(stat_val))))
                    else:
                        # display time-stamp range as number of days
                        stat_val /= NUM_NS_DAY
                        self.stat_vals[stat_var] = (stat_val,
                                                    '{:g}'.format(stat_val))

            # append statistics to print lines
            name_len = max(len(n) for n in stat_vars)
            for stat_var in stat_vars:
                self.print_lines.append(
                    '{{0:{:d}s}} : {{1:s}}'.format(name_len).format(
                        stat_var, self.stat_vals[stat_var][1]))

    def get_print_stats(self, to_output=False):
        """Get statistics in printable form.

        :param bool to_output: Print statistics to output stream?
        :returns str: Printable statistics string
        """
        # create statistics print lines
        if not self.stat_vals:
            self.create_stats()

        # create printable string
        print_str = '\n'.join(self.print_lines) + '\n'
        if to_output:
            print(print_str)

        return print_str

    def get_latex_table(self, get_stats=None, latex=True):
        """Get LaTeX code string for table of stats values.

        :param list get_stats: List of statistics that you want to filter on (default None, i.e. all stats).
                               Available stats are: 'count', 'filled', 'distinct', 'nan' (if null values are present),
                               'mean', 'std', 'min', 'max', 'p01', 'p05', 'p16', 'p50', 'p84', 'p95', 'p99',
                               and 'mpv' once make_histogram() has been run.
        :param bool latex: LaTeX output or list output (default True)
        :returns str: LaTeX code snippet
        """
        # create statistics print lines
        if not self.stat_vals:
            self.create_stats()

        # create LaTeX string
        table = [(stat_var, self.stat_vals[stat_var][1])
                 for stat_var in self.stat_vars]
        if get_stats is not None:
            table = [(var, val) for var, val in table if var in get_stats]
        if latex:
            return tabulate.tabulate(table, tablefmt='latex')
        else:
            return table

    def get_x_label(self):
        """Get x label."""
        x_lab = self.label if self.label else self.name
        if self.unit:
            x_lab += ' [{}]'.format(self.unit)
        return x_lab

    def make_histogram(self,
                       var_bins=30,
                       var_range=None,
                       bin_edges=None,
                       create_mpv_stat=True):
        """Create histogram of column values.

        :param int var_bins: Number of histogram bins
        :param tuple var_range: Range of histogram variable
        :param list bin_edges: Predefined bin edges to use for the histogram; overrides var_bins
        :param bool create_mpv_stat: Compute the most probable value ('mpv') statistic from the filled histogram (default True)
        :returns tuple: Histogram values and bin edges (or labels for a categorical column)
        """
        # create statistics overview
        if not self.stat_vals:
            self.create_stats()

        # determine column properties
        col_props = self.get_col_props()

        if col_props['is_num']:
            col_num = self.col_nn

            # determine histogram range for numeric variable
            if var_range:
                # get minimum and maximum of variable for histogram from specified range
                var_min, var_max = var_range
                if col_props['is_ts']:
                    # convert minimum and maximum to Unix time stamps
                    var_min, var_max = pd.Timestamp(
                        var_min).value, pd.Timestamp(var_max).value
            else:
                # determine minimum and maximum of variable for histogram from percentiles
                var_min, var_max = self.stat_vals.get(
                    'p05')[0], self.stat_vals.get('p95')[0]
                if col_props['is_ts']:
                    var_min, var_max = pd.Timestamp(
                        var_min).value, pd.Timestamp(var_max).value
                var_min -= 0.05 * (var_max - var_min)
                var_max += 0.05 * (var_max - var_min)
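                # snap a histogram edge to zero if it lies within 20% of the range from zero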
                if 0. < var_min < +0.2 * (var_max - var_min):
                    var_min = 0.
                elif -0.2 * (var_max - var_min) < var_max < 0.:
                    var_max = 0.
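                # the percentile-based limits may be NaN; fall back to the outermost specified bin edges in that case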
                if np.isnan(var_min):
                    var_min = bin_edges[0]
                assert not np.isnan(var_min)
                if np.isnan(var_max):
                    var_max = bin_edges[-1]
                assert not np.isnan(var_max)

            if col_props['is_ts']:
                # np.histogram cannot deal with timestamps, so convert to ints and convert them back below.
                to_timestamp = np.vectorize(lambda x: pd.Timestamp(x).value)
                col_num = to_timestamp(self.col_nn)
                if bin_edges is not None:
                    bin_edges = (to_timestamp(bin_edges)).tolist()

            if bin_edges is not None:
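                # snap the histogram limits onto the predefined bin boundaries and keep only the edges inside [var_min, var_max]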
                bin_util = BinningUtil(bin_edges=bin_edges)
                idx_min = bin_util.value_to_bin_label(var_min)
                var_min = bin_util.get_left_bin_edge(idx_min)
                idx_max = bin_util.value_to_bin_label(var_max)
                var_max = bin_util.get_right_bin_edge(idx_max)
                var_bins = bin_util.truncated_bin_edges(
                    variable_range=[var_min, var_max])
            else:
                if col_props['is_int'] or col_props['is_ts']:
                    # for ints and ts use bins around integer values
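                    # the bin width is at least one unit, the lower edge is moved down to the nearest half-integer,
                    # and the number of bins is rounded up so that [var_min, var_max] is fully covered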
                    bin_width = np.max((np.round(
                        (var_max - var_min) / float(var_bins)), 1.))
                    var_min = np.floor(var_min - 0.5) + 0.5
                    var_bins = int((var_max - var_min) // bin_width) + int(
                        (var_max - var_min) % bin_width > 0.)
                    var_max = var_min + var_bins * bin_width

            # make (weighted) histogram. note that var_bins supersedes range
            values, bins = np.histogram(col_num,
                                        bins=var_bins,
                                        range=(var_min, var_max),
                                        weights=self.weights_nn)

            if col_props['is_ts']:
                # convert Unix time stamps to Pandas time stamps
                bins = [pd.Timestamp(ts) for ts in bins]
            self.hist = values, bins
        else:
            # get data from data frame for categorical column
            if self.weights_nn is None:
                val_counts = self.col_nn.value_counts(
                    sort=True).iloc[:var_bins].to_dict()
            else:
                val_counts = Counter()
                for k, v in zip(self.col_nn, self.weights_nn):
                    val_counts[k] += v
                val_counts = dict(val_counts.most_common(var_bins))
            sorted_vc = sorted(val_counts.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
            labels = [lc[0] for lc in sorted_vc]
            values = [lc[1] for lc in sorted_vc]
            self.hist = values, labels

        # compute most probable value from histogram and add to statistics
        if create_mpv_stat:
            self.create_mpv_stat()

        return self.hist
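
Not part of the original example: a short usage sketch of ArrayStats on a made-up numeric column; the DataFrame, column name, unit, and label below are illustrative only.

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.normal(10.0, 2.0, size=1000)})
stats = ArrayStats(df, 'x', unit='cm', label='example variable')
print(stats.get_print_stats())                     # counts, mean, std, min/max and percentiles
values, bins = stats.make_histogram(var_bins=20)   # fills the histogram and adds the 'mpv' statistic
print(stats.get_latex_table(get_stats=['mean', 'std', 'p50']))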