Example #1
    def lightgbm_model(self,
                       target,
                       features=None,
                       num_boost_round=100,
                       copy=False,
                       params={},
                       prediction_name='lightgbm_prediction'):
        '''Requires vaex.ml: create a lightgbm model and train/fit it.

        :param target: The name of the target column.
        :param features: List of features to use when training the model. If None, all columns except the target will be used as features.
        :param num_boost_round: Number of boosting rounds.
        :param bool copy: If True, copy the data; otherwise use a more memory-efficient data transfer method.
        :return vaex.ml.lightgbm.LightGBMModel: Fitted LightGBM model.
        '''
        from .lightgbm import LightGBMModel
        dataframe = self.df
        target = _ensure_strings_from_expressions(target)
        if features is None:
            # use all columns except the target column as features
            features = [col for col in self.df.get_column_names(virtual=True) if col != target]
        features = _ensure_strings_from_expressions(features)

        booster = LightGBMModel(prediction_name=prediction_name,
                                num_boost_round=num_boost_round,
                                features=features,
                                target=target,
                                params=params)
        booster.fit(dataframe, copy=copy)
        return booster
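
A minimal usage sketch for the accessor method above (not from the source; it assumes vaex.ml is installed, that the method is exposed on the df.ml accessor used in the tests further below, and the LightGBM params shown are purely illustrative):

    import vaex.ml
    import vaex.ml.datasets

    df = vaex.ml.datasets.load_iris()
    params = {'objective': 'multiclass', 'num_class': 3}  # illustrative LightGBM params
    booster = df.ml.lightgbm_model(target='class_', num_boost_round=10, params=params)
    df = booster.transform(df)  # adds the 'lightgbm_prediction' virtual column
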
Example #2
    def catboost_model(self,
                       target,
                       features=None,
                       num_boost_round=100,
                       params=None,
                       prediction_name='catboost_prediction'):
        '''Requires vaex.ml: create a CatBoostModel model and train/fit it.

        :param target: The name of the target column.
        :param features: List of features to use when training the model. If None, all columns except the target will be used as features.
        :param num_boost_round: Number of boosting rounds.
        :return vaex.ml.catboost.CatBoostModel: Fitted CatBoostModel model.
        '''
        from .catboost import CatBoostModel
        dataframe = self.df
        target = _ensure_strings_from_expressions(target)
        if features is None:
            # use all columns except the target column as features
            features = [col for col in self.df.get_column_names(virtual=True) if col != target]
        features = _ensure_strings_from_expressions(features)
        booster = CatBoostModel(prediction_name=prediction_name,
                                num_boost_round=num_boost_round,
                                features=features,
                                target=target,
                                params=params)
        booster.fit(dataframe)
        return booster
Example #3
 def expand(self, stop=[]):
     stop = _ensure_strings_from_expressions(stop)
     def translate(id):
         if id in self.ds.virtual_columns and id not in stop:
             return self.ds.virtual_columns[id]
     expr = expresso.translate(self.expression, translate)
     return Expression(self.ds, expr)
Example #4
    def pygbm_model(self,
                    label,
                    max_iter,
                    features=None,
                    param={},
                    classifier=False,
                    prediction_name='pygbm_prediction',
                    **kwargs):
        '''Requires vaex.ml: create a pygbm model and train/fit it.

        :param label: Label to train/fit on
        :param max_iter: Max number of iterations/trees
        :param features: List of features to train on
        :param bool classifier: If True, return the classifier (will use argmax on the probabilities)
        :return vaex.ml.pygbm.PyGBMModel or vaex.ml.pygbm.PyGBMClassifier: Fitted PyGBM model
        '''
        from .incubator.pygbm import PyGBMModel, PyGBMClassifier
        dataframe = self.df
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        cls = PyGBMClassifier if classifier else PyGBMModel
        b = cls(prediction_name=prediction_name,
                max_iter=max_iter,
                features=features,
                param=param,
                **kwargs)
        b.fit(dataframe, label)
        return b
Example #5
 def expand(self, stop=[]):
     stop = _ensure_strings_from_expressions(stop)
     def translate(id):
         if id in self.ds.virtual_columns and id not in stop:
             return self.ds.virtual_columns[id]
     expr = expresso.translate(self.expression, translate)
     return Expression(self.ds, expr)
Example #6
    def lightgbm_model(self,
                       target,
                       num_boost_round,
                       features=None,
                       copy=False,
                       params={},
                       prediction_name='lightgbm_prediction'):
        '''Requires vaex.ml: create a lightgbm model and train/fit it.

        :param target: The target variable to predict.
        :param num_boost_round: Number of boosting iterations.
        :param features: List of features to train on.
        :param bool copy: If True, copy the data; otherwise use a more memory-efficient data transfer method.
        :return vaex.ml.lightgbm.LightGBMModel: Fitted LightGBM model.
        '''
        from .lightgbm import LightGBMModel
        dataframe = self.df
        features = features or self.df.get_column_names(virtual=True)
        features = _ensure_strings_from_expressions(features)

        booster = LightGBMModel(prediction_name=prediction_name,
                                num_boost_round=num_boost_round,
                                features=features,
                                params=params)
        booster.fit(dataframe, target, copy=copy)
        return booster
Example #7
def iter_vaex(
    X: vaex.dataframe.DataFrame,
    y: typing.Union[str, vaex.expression.Expression] = None,
    features: typing.Union[typing.List[str],
                           vaex.expression.Expression] = None,
) -> base.typing.Stream:
    """Yields rows from a ``vaex.DataFrame``.

    Parameters
    ----------
    X
        A vaex DataFrame housing the training features.
    y
        The column or expression containing the target variable.
    features
        A list of features used for training. If None, all columns in `X` will be used. Features
        specified in `y` are ignored.

    """

    features = _ensure_strings_from_expressions(features)
    feature_names = features or X.get_column_names()

    if y:
        y = _ensure_strings_from_expressions(y)
        y = _ensure_list(y)
        feature_names = [feat for feat in feature_names if feat not in y]

    multioutput = len(y) > 1

    if multioutput:
        for i in range(len(X)):
            yield (
                {key: X.evaluate(key, i, i + 1)[0]
                 for key in feature_names},
                {key: X.evaluate(key, i, i + 1)[0]
                 for key in y},
            )

    else:

        for i in range(len(X)):
            yield (
                {key: X.evaluate(key, i, i + 1)[0]
                 for key in feature_names},
                X.evaluate(y[0], i, i + 1)[0],
            )
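
A small usage sketch for iter_vaex (assuming the function above is importable and vaex.ml.datasets is available; the column names come from the iris dataset used in the tests below):

    import vaex.ml.datasets

    df = vaex.ml.datasets.load_iris()
    stream = iter_vaex(df, y='class_', features=['sepal_length', 'sepal_width'])
    x, y = next(stream)
    print(x)  # {'sepal_length': ..., 'sepal_width': ...}
    print(y)  # class label of the first row
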
Example #8
def label_encoder(self, features=None, prefix='label_encoded_'):
    '''Requires vaex.ml: Create :class:`vaex.ml.transformations.LabelEncoder` and fit it.

    :param features: List of features to encode.
    :param prefix: Prefix for the names of the encoded features.
    '''
    features = features or self.get_column_names()
    features = _ensure_strings_from_expressions(features)
    label_encoder = LabelEncoder(features=features, prefix=prefix)
    label_encoder.fit(self)
    return label_encoder
Example #9
def iter_vaex(X, y=None, features=None, **kwargs):
    """Yields rows from a ``vaex.DataFrame``.

    Parameters:
        X (vaex.DataFrame): A vaex DataFrame housing the training features.
        y (string or vaex.Expression): The column or expression containing the target variable.
        features (list of strings or vaex.Expressions): A list of features used for training.
            If None, all columns in ``X`` will be used. Features specified in ``y`` are ignored.

    Yields:
        tuple: A pair (``x``, ``y``) where ``x`` is a dict of features and ``y`` is the target.

    """

    from vaex.utils import _ensure_strings_from_expressions, _ensure_list

    features = _ensure_strings_from_expressions(features)
    feature_names = features or X.get_column_names()

    if y:
        y = _ensure_strings_from_expressions(y)
        y = _ensure_list(y)
        feature_names = [feat for feat in feature_names if feat not in y]

    multioutput = len(y) > 1

    if multioutput:
        for i in range(len(X)):
            yield ({
                key: X.evaluate(key, i, i + 1)[0]
                for key in feature_names
            }, {key: X.evaluate(key, i, i + 1)[0]
                for key in y})

    else:

        for i in range(len(X)):
            yield ({
                key: X.evaluate(key, i, i + 1)[0]
                for key in feature_names
            }, X.evaluate(y[0], i, i + 1)[0])
Example #10
    def minmax_scaler(self, features=None, feature_range=[0, 1], prefix='minmax_scaled_'):
        '''Requires vaex.ml: Create :class:`vaex.ml.transformations.MinMaxScaler` and fit it.

        :param features: List of features to scale.
        :param feature_range: The range the features are scaled to.
        :param prefix: Prefix for the names of the scaled features.
        '''
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        minmax_scaler = MinMaxScaler(features=features, feature_range=feature_range, prefix=prefix)
        minmax_scaler.fit(self.df)
        return minmax_scaler
Example #11
    def label_encoder(self, features=None, prefix='label_encoded_', allow_unseen=False):
        '''Requires vaex.ml: Create :class:`vaex.ml.transformations.LabelEncoder` and fit it.

        :param features: List of features to encode.
        :param prefix: Prefix for the names of the encoded features.
        :param allow_unseen: If True, encode unseen value as -1, otherwise an error is raised.
        '''
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        label_encoder = LabelEncoder(features=features, prefix=prefix, allow_unseen=allow_unseen)
        label_encoder.fit(self.df)
        return label_encoder
Example #12
    def standard_scaler(self, features=None, with_mean=True, with_std=True, prefix='standard_scaled_'):
        '''Requires vaex.ml: Create :class:`vaex.ml.transformations.StandardScaler` and fit it.

        :param features: List of features to scale.
        :param with_mean: If True, remove the mean from each feature.
        :param with_std: If True, scale each feature to unit variance.
        :param prefix: Prefix for the names of the scaled features.
        '''
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        standard_scaler = StandardScaler(features=features, with_mean=with_mean, with_std=with_std, prefix=prefix)
        standard_scaler.fit(self.df)
        return standard_scaler
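
The scaler and encoder helpers above all follow the same fit-and-return pattern; a minimal sketch of how one of them is typically used (assuming the df.ml accessor from the tests below and the iris column names):

    import vaex.ml
    import vaex.ml.datasets

    df = vaex.ml.datasets.load_iris()
    scaler = df.ml.standard_scaler(features=['sepal_length', 'sepal_width'])
    df = scaler.transform(df)  # adds 'standard_scaled_sepal_length', 'standard_scaled_sepal_width'
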
Example #13
    def pca(self, n_components=2, features=None, prefix='PCA_', progress=False):
        '''Requires vaex.ml: Create :class:`vaex.ml.transformations.PCA` and fit it.

        :param n_components: Number of components to retain. If None, all the components will be retained.
        :param features: List of features to transform.
        :param prefix: Prefix for the names of the transformed features.
        :param progress: If True, display a progressbar of the PCA fitting process.
        '''
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        pca = PCA(n_components=n_components, features=features, prefix=prefix, progress=progress)
        pca.fit(self.df)
        return pca
Example #14
    def frequency_encoder(self, features=None, unseen='nan', prefix='frequency_encoded_'):
        '''
        Requires vaex.ml: Create :class:`vaex.ml.transformations.FrequencyEncoder` and fit it.

        :param features: List of features to encode.
        :param unseen: Strategy to deal with unseen values. Accepted arguments are "zero" or "nan".
        :param prefix: Prefix for the names of the encoded features.
        '''
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        freq_encoder = FrequencyEncoder(features=features, unseen=unseen, prefix=prefix)
        freq_encoder.fit(self.df)
        return freq_encoder
Example #15
    def fit(self, df):
        '''Fit OneHotEncoder to the DataFrame.

        :param df: A vaex DataFrame.
        '''

        uniques = []
        for i in self.features:
            expression = _ensure_strings_from_expressions(i)
            unique = df.unique(expression)
            unique = np.sort(unique)  # this can/should be optimized with @delay
            uniques.append(unique.tolist())
        self.uniques_ = uniques
Example #16
def test_py_gbm_virtual_columns():
    ds = vaex.ml.datasets.load_iris()
    ds['x'] = ds.sepal_length * 1
    ds['y'] = ds.sepal_width * 1
    ds['w'] = ds.petal_length * 1
    ds['z'] = ds.petal_width * 1
    ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False)
    features = ['x', 'y', 'z', 'w']
    booster = vaex.ml.incubator.pygbm.PyGBMModel(
        num_round=10,
        param=param,
        features=_ensure_strings_from_expressions(features))
    booster.fit(ds_train, ds_train.class_)
Example #17
    def one_hot_encoder(self, features=None, one=1, zero=0, prefix=''):
        '''Requires vaex.ml: Create :class:`vaex.ml.transformations.OneHotEncoder` and fit it.

        :param features: List of features to encode.
        :param one: What value to use instead of "1".
        :param zero: What value to use instead of "0".
        :param prefix: Prefix for the names of the encoded features.
        :returns one_hot_encoder: vaex.ml.transformations.OneHotEncoder object.
        '''
        if features is None:
            raise ValueError('Please give at least one categorical feature.')
        features = _ensure_strings_from_expressions(features)
        one_hot_encoder = OneHotEncoder(features=features, one=one, zero=zero, prefix=prefix)
        one_hot_encoder.fit(self.df)
        return one_hot_encoder
Example #18
def test_lightgbm():
    ds = vaex.ml.datasets.load_iris()
    ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False)
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    features = _ensure_strings_from_expressions(features)
    booster = vaex.ml.lightgbm.LightGBMModel(num_boost_round=10, params=params, features=features, target='class_')

    booster.fit(ds_train, copy=True)    # for coverage
    class_predict_train = booster.predict(ds_train, copy=True)  # for coverage
    class_predict_test = booster.predict(ds_test)
    assert np.all(ds_test.col.class_.values == np.argmax(class_predict_test, axis=1))

    ds_train = booster.transform(ds_train)   # this will add the lightgbm_prediction column
    state = ds_train.state_get()
    ds_test.state_set(state)
    assert np.all(ds_test.col.class_.values == np.argmax(ds_test.lightgbm_prediction.values, axis=1))
Example #19
    def expand(self, stop=[]):
        """Expand the expression such that no virtual columns occurs, only normal columns.

        Example:

        >>> df = vaex.example()
        >>> r = np.sqrt(df.data.x**2 + df.data.y**2)
        >>> r.expand().expression
        'sqrt(((x ** 2) + (y ** 2)))'

        """
        stop = _ensure_strings_from_expressions(stop)
        def translate(id):
            if id in self.ds.virtual_columns and id not in stop:
                return self.ds.virtual_columns[id]
        expr = expresso.translate(self.ast, translate)
        return Expression(self.ds, expr)
Example #20
                    def wrapper(self, features=None, transform=True, **kwargs):
                        kwargs = kwargs.copy()  # we do modifications, so make a copy
                        features = features or self.df.get_column_names()
                        features = _ensure_strings_from_expressions(features)
                        import importlib
                        module = importlib.import_module(class_spec['module'])
                        cls = getattr(module, class_spec['classname'])
                        if 'target' in kwargs:
                            kwargs['target'] = str(kwargs['target'])

                        object = cls(features=features, **kwargs)
                        object.fit(self.df)
                        if transform:
                            dft = object.transform(self.df)
                            return dft
                        else:
                            return object
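
The wrapper above resolves the transformer class at runtime from a class_spec dict via importlib. A self-contained sketch of that dynamic-import pattern (the class_spec here is hypothetical, chosen only so the snippet runs against the standard library):

    import importlib

    class_spec = {'module': 'collections', 'classname': 'Counter'}  # hypothetical spec
    module = importlib.import_module(class_spec['module'])
    cls = getattr(module, class_spec['classname'])
    obj = cls('abracadabra')
    print(obj.most_common(2))  # [('a', 5), ('b', 2)]
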
Example #21
 def evaluate(self,
              expression,
              i1=None,
              i2=None,
              out=None,
              selection=None,
              delay=False):
      """Basic support for evaluate at the server, at least enough to run some unit tests; do not expect this to work with string expressions."""
      expression = _ensure_strings_from_expressions(expression)
     result = self.server._call_dataset("evaluate",
                                        self,
                                        expression=expression,
                                        i1=i1,
                                        i2=i2,
                                        selection=selection,
                                        delay=delay)
     # TODO: we ignore out
     return result
Example #22
def test_lightgbm():
    ds = vaex.ml.datasets.load_iris()
    ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False)
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    booster = vaex.ml.lightgbm.LightGBMModel(
        num_boost_round=10,
        params=params,
        features=_ensure_strings_from_expressions(features))
    booster.fit(ds_train, ds.class_, copy=True)  # for coverage
    class_predict = booster.predict(ds_test, copy=True)  # for coverage
    booster.fit(ds_train, ds.class_)
    class_predict = booster.predict(ds_test)
    assert np.all(ds.col.class_ == class_predict)

    ds = booster.transform(ds)  # this will add the lightgbm_prediction column
    state = ds.state_get()
    ds = vaex.ml.datasets.load_iris()
    ds.state_set(state)
    assert np.all(ds.col.class_ == ds.evaluate(ds.lightgbm_prediction))
Example #23
    def fit(self, df):
        '''Fit OneHotEncoder to the DataFrame.

        :param df: A vaex DataFrame.
        '''

        uniques = []
        for i in self.features:
            expression = _ensure_strings_from_expressions(i)
            unique_values = vaex.array_types.tolist(df.unique(expression))

            if None in unique_values:
                unique_values.remove(None)
                unique_values.sort()
                unique_values.insert(0, None)  # This is done in place
            else:
                unique_values.sort()
            uniques.append(unique_values)
        self.uniques_ = uniques
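
The None handling above keeps a missing-value bucket at the front of the sorted unique values; a tiny standalone illustration of that ordering:

    unique_values = ['b', None, 'a']
    if None in unique_values:
        unique_values.remove(None)     # drop None so sort() does not raise
        unique_values.sort()
        unique_values.insert(0, None)  # missing-value bucket goes first
    else:
        unique_values.sort()
    print(unique_values)  # [None, 'a', 'b']
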
Example #24
def test_pygbm():
    for filename in 'blah.col.meta blah.col.page'.split():
        if os.path.exists(filename):
            os.remove(filename)
    ds = vaex.ml.datasets.load_iris()
    ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False)
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    booster = vaex.ml.incubator.pygbm.PyGBMModel(
        num_round=10,
        param=param,
        features=_ensure_strings_from_expressions(features))
    booster.fit(ds_train, ds_train.class_)
    class_predict = booster.predict(ds_test)
    booster.fit(ds_train, ds_train.class_)
    class_predict = booster.predict(ds_test)
    assert np.all(ds.col.class_ == class_predict)

    ds = booster.transform(ds)  # this will add the pygbm_prediction column
    state = ds.state_get()
    ds = vaex.ml.datasets.load_iris()
    ds.state_set(state)
    assert np.all(ds.col.class_ == ds.evaluate(ds.pygbm_prediction))
Example #25
def xgboost_model(self,
                  target,
                  num_boost_round,
                  features=None,
                  params={},
                  prediction_name='xgboost_prediction'):
    '''Requires vaex.ml: create a XGBoost model and train/fit it.

    :param target: Target to train/fit on.
    :param num_boost_round: Number of rounds.
    :param features: List of features to train on.
    :return vaex.ml.xgboost.XGBModel: Fitted XGBoost model.
    '''
    from .xgboost import XGBoostModel
    dataframe = self
    features = features or self.get_column_names()
    features = _ensure_strings_from_expressions(features)
    booster = XGBoostModel(prediction_name=prediction_name,
                           num_boost_round=num_boost_round,
                           features=features,
                           params=params)
    booster.fit(dataframe, target)
    return booster
Example #26
    def catboost_model(self,
                       target,
                       num_boost_round,
                       features=None,
                       params=None,
                       prediction_name='catboost_prediction'):
        '''Requires vaex.ml: create a CatBoostModel model and train/fit it.

        :param target: Target to train/fit on
        :param num_boost_round: Number of rounds
        :param features: List of features to train on
        :return vaex.ml.catboost.CatBoostModel: Fitted CatBoostModel model
        '''
        from .catboost import CatBoostModel
        dataframe = self.df
        features = features or self.df.get_column_names()
        features = _ensure_strings_from_expressions(features)
        booster = CatBoostModel(prediction_name=prediction_name,
                                num_boost_round=num_boost_round,
                                features=features,
                                params=params)
        booster.fit(dataframe, target)
        return booster
def plot(
        self,
        x=None,
        y=None,
        z=None,
        what="count(*)",
        vwhat=None,
        reduce=["colormap"],
        f=None,
        normalize="normalize",
        normalize_axis="what",
        vmin=None,
        vmax=None,
        shape=256,
        vshape=32,
        limits=None,
        grid=None,
        colormap="afmhot",  # colors=["red", "green", "blue"],
        figsize=None,
        xlabel=None,
        ylabel=None,
        aspect="auto",
        tight_layout=True,
        interpolation="nearest",
        show=False,
        colorbar=True,
        colorbar_label=None,
        selection=None,
        selection_labels=None,
        title=None,
        background_color="white",
        pre_blend=False,
        background_alpha=1.,
        visual=dict(x="x",
                    y="y",
                    layer="z",
                    fade="selection",
                    row="subspace",
                    column="what"),
        smooth_pre=None,
        smooth_post=None,
        wrap=True,
        wrap_columns=4,
        return_extra=False,
        hardcopy=None):
    """Viz data in a 2d histogram/heatmap.

    Declarative plotting of statistical plots using matplotlib, supports subplots, selections, layers.

    Instead of passing x and y, pass a list as the x argument for multiple panels. Give what a list of options to have multiple
    panels. When both are present, they will be organized in a column/row order.

    This method creates a 6 dimensional 'grid', where each dimension can map to a visual dimension.
    The grid dimensions are:

     * x: shape determined by shape, content by x argument or the first dimension of each space
     * y:   ,,
     * z:  related to the z argument
     * selection: shape equals length of selection argument
     * what: shape equals length of what argument
     * space: shape equals length of x argument if multiple values are given

     By default, its shape is (1, 1, 1, 1, shape, shape) (where x is the last dimension)

    The visual dimensions are

     * x: x coordinate on a plot / image (default maps to grid's x)
     * y: y   ,,                         (default maps to grid's y)
     * layer: each image in this dimension is blended together into one image (default maps to z)
     * fade: each image is shown faded after the next image (default maps to selection)
     * row: rows of subplots (default maps to space)
     * columns: columns of subplots (default maps to what)

    All these mappings can be changed by the visual argument; some examples:

    >>> df.plot('x', 'y', what=['mean(x)', 'correlation(vx, vy)'])

    Will plot each 'what' as a column.

    >>> df.plot('x', 'y', selection=['FeH < -3', '(FeH >= -3) & (FeH < -2)'], visual=dict(column='selection'))

    Will plot each selection as a column, instead of faded on top of each other.

    :param x: Expression to bin in the x direction (by default maps to x), or list of pairs, like [['x', 'y'], ['x', 'z']], if multiple pairs are given, this dimension maps to rows by default
    :param y:                          y           (by default maps to y)
    :param z: Expression to bin in the z direction, followed by a :start,end,shape signature, like 'FeH:-3,1,5' will produce 5 layers between -3 and 1 (by default maps to layer)
    :param what: What to plot, count(*) will show a N-d histogram, mean('x'), the mean of the x column, sum('x') the sum, std('x') the standard deviation, correlation('vx', 'vy') the correlation coefficient. Can also be a list of values, like ['count(x)', 'std(vx)'] (by default maps to column)
    :param reduce:
    :param f: transform values by: 'identity' does nothing 'log' or 'log10' will show the log of the value
    :param normalize: normalization function, currently only 'normalize' is supported
    :param normalize_axis: which axes to normalize on, None means normalize by the global maximum.
    :param vmin: instead of automatic normalization, (using normalize and normalization_axis) scale the data between vmin and vmax to [0, 1]
    :param vmax: see vmin
    :param shape: shape/size of the n-D histogram grid
    :param limits: list of [[xmin, xmax], [ymin, ymax]], or a description such as 'minmax', '99%'
    :param grid: if the binning is done before by yourself, you can pass it
    :param colormap: matplotlib colormap to use
    :param figsize: (x, y) tuple passed to pylab.figure for setting the figure size
    :param xlabel:
    :param ylabel:
    :param aspect:
    :param tight_layout: call pylab.tight_layout or not
    :param colorbar: plot a colorbar or not
    :param interpolation: interpolation for imshow, possible options are: 'nearest', 'bilinear', 'bicubic', see matplotlib for more
    :param return_extra:
    :return:
    """
    import pylab
    import matplotlib
    n = _parse_n(normalize)
    if type(shape) == int:
        shape = (shape, ) * 2
    binby = []
    x = _ensure_strings_from_expressions(x)
    y = _ensure_strings_from_expressions(y)
    for expression in [y, x]:
        if expression is not None:
            binby = [expression] + binby
    fig = pylab.gcf()
    if figsize is not None:
        fig.set_size_inches(*figsize)
    import re

    what_units = None
    whats = _ensure_list(what)
    selections = _ensure_list(selection)
    selections = _ensure_strings_from_expressions(selections)

    if y is None:
        waslist, [
            x,
        ] = vaex.utils.listify(x)
    else:
        waslist, [x, y] = vaex.utils.listify(x, y)
        x = list(zip(x, y))
        limits = [limits]

    # every plot has its own vwhat for now
    vwhats = _expand_limits(vwhat,
                            len(x))  # TODO: we're abusing this function..
    logger.debug("x: %s", x)
    limits, shape = self.limits(x, limits, shape=shape)
    shape = shape[0]
    logger.debug("limits: %r", limits)

    # mapping of a grid axis to a label
    labels = {}
    shape = _expand_shape(shape, 2)
    vshape = _expand_shape(shape, 2)
    if z is not None:
        match = re.match("(.*):(.*),(.*),(.*)", z)
        if match:
            groups = match.groups()
            import ast
            z_expression = groups[0]
            logger.debug("found groups: %r", list(groups))
            z_limits = [
                ast.literal_eval(groups[1]),
                ast.literal_eval(groups[2])
            ]
            z_shape = ast.literal_eval(groups[3])
            # for pair in x:
            x = [[z_expression] + list(k) for k in x]
            limits = np.array([[z_limits] + list(k) for k in limits])
            shape = (z_shape, ) + shape
            vshape = (z_shape, ) + vshape
            logger.debug("x = %r", x)
            values = np.linspace(z_limits[0], z_limits[1], num=z_shape + 1)
            labels["z"] = list([
                "%s <= %s < %s" % (v1, z_expression, v2)
                for v1, v2 in zip(values[:-1], values[1:])
            ])
        else:
            raise ValueError(
                "Could not understand 'z' argument %r, expected something in form: 'column:-1,10:5'"
                % facet)
    else:
        z_shape = 1

    # z == 1
    if z is None:
        total_grid = np.zeros((len(x), len(whats), len(selections), 1) + shape,
                              dtype=float)
        total_vgrid = np.zeros(
            (len(x), len(whats), len(selections), 1) + vshape, dtype=float)
    else:
        total_grid = np.zeros((len(x), len(whats), len(selections)) + shape,
                              dtype=float)
        total_vgrid = np.zeros((len(x), len(whats), len(selections)) + vshape,
                               dtype=float)
    logger.debug("shape of total grid: %r", total_grid.shape)
    axis = dict(plot=0, what=1, selection=2)
    xlimits = limits

    grid_axes = dict(x=-1, y=-2, z=-3, selection=-4, what=-5, subspace=-6)
    visual_axes = dict(x=-1, y=-2, layer=-3, fade=-4, column=-5, row=-6)
    # visual_default=dict(x="x", y="y", z="layer", selection="fade", subspace="row", what="column")
    # visual: mapping of a plot axis, to a grid axis
    visual_default = dict(x="x",
                          y="y",
                          layer="z",
                          fade="selection",
                          row="subspace",
                          column="what")

    def invert(x):
        return dict((v, k) for k, v in x.items())

    # visual_default_reverse = invert(visual_default)
    # visual_ = visual_default
    # visual = dict(visual) # copy for modification
    # add entries to avoid mapping multiple times to the same axis
    free_visual_axes = list(visual_default.keys())
    # visual_reverse = invert(visual)
    logger.debug("1: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual.items():
        if visual_name in free_visual_axes:
            free_visual_axes.remove(visual_name)
        else:
            raise ValueError("visual axes %s used multiple times" %
                             visual_name)
    logger.debug("2: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual_default.items():
        if visual_name in free_visual_axes and grid_name not in visual.values(
        ):
            free_visual_axes.remove(visual_name)
            visual[visual_name] = grid_name
    logger.debug("3: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual_default.items():
        if visual_name not in free_visual_axes and grid_name not in visual.values(
        ):
            visual[free_visual_axes.pop(0)] = grid_name

    logger.debug("4: %r %r", visual, free_visual_axes)

    visual_reverse = invert(visual)
    # TODO: the meaning of visual and visual_reverse is changed below this line, super confusing
    visual, visual_reverse = visual_reverse, visual
    # so now, visual: mapping of a grid axis to plot axis
    # visual_reverse: mapping of a grid axis to plot axis
    move = {}
    for grid_name, visual_name in visual.items():
        if visual_axes[visual_name] in visual.values():
            index = list(visual.values()).index(visual_name)
            key = list(visual.keys())[index]
            raise ValueError(
                "trying to map %s to %s while, it is already mapped by %s" %
                (grid_name, visual_name, key))
        move[grid_axes[grid_name]] = visual_axes[visual_name]

    # normalize_axis = _ensure_list(normalize_axis)

    fs = _expand(f, total_grid.shape[grid_axes[normalize_axis]])
    # assert len(vwhat)
    # labels["y"] = ylabels
    what_labels = []
    if grid is None:
        grid_of_grids = []
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            grid_of_grids.append([])
            for j, what in enumerate(whats):
                if isinstance(what, vaex.stat.Expression):
                    grid = what.calculate(self,
                                          binby=binby,
                                          shape=shape,
                                          limits=limits,
                                          selection=selections,
                                          delay=True)
                else:
                    what = what.strip()
                    index = what.index("(")
                    import re
                    groups = re.match("(.*)\((.*)\)", what).groups()
                    if groups and len(groups) == 2:
                        function = groups[0]
                        arguments = groups[1].strip()
                        if "," in arguments:
                            arguments = arguments.split(",")
                        functions = [
                            "mean", "sum", "std", "var", "correlation",
                            "covar", "min", "max", "median_approx"
                        ]
                        unit_expression = None
                        if function in [
                                "mean", "sum", "std", "min", "max", "median"
                        ]:
                            unit_expression = arguments
                        if function in ["var"]:
                            unit_expression = "(%s) * (%s)" % (arguments,
                                                               arguments)
                        if function in ["covar"]:
                            unit_expression = "(%s) * (%s)" % arguments
                        if unit_expression:
                            unit = self.unit(unit_expression)
                            if unit:
                                what_units = unit.to_string('latex_inline')
                        if function in functions:
                            grid = getattr(self,
                                           function)(arguments,
                                                     binby=binby,
                                                     limits=limits,
                                                     shape=shape,
                                                     selection=selections,
                                                     delay=True)
                        elif function == "count":
                            grid = self.count(arguments,
                                              binby,
                                              shape=shape,
                                              limits=limits,
                                              selection=selections,
                                              delay=True)
                        else:
                            raise ValueError(
                                "Could not understand method: %s, expected one of %r'"
                                % (function, functions))
                    else:
                        raise ValueError(
                            "Could not understand 'what' argument %r, expected something in form: 'count(*)', 'mean(x)'"
                            % what)
                if i == 0:  # and j == 0:
                    what_label = str(whats[j])
                    if what_units:
                        what_label += " (%s)" % what_units
                    if fs[j]:
                        what_label = fs[j] + " " + what_label
                    what_labels.append(what_label)
                grid_of_grids[-1].append(grid)
        self.executor.execute()
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            for j, what in enumerate(whats):
                grid = grid_of_grids[i][j].get()
                total_grid[i, j, :, :] = grid[:, None, ...]
        labels["what"] = what_labels
    else:
        dims_left = 6 - len(grid.shape)
        total_grid = np.broadcast_to(grid, (1, ) * dims_left + grid.shape)

    #           visual=dict(x="x", y="y", selection="fade", subspace="facet1", what="facet2",)
    def _selection_name(name):
        if name in [None, False]:
            return "selection: all"
        elif name in ["default", True]:
            return "selection: default"
        else:
            return "selection: %s" % name

    if selection_labels is None:
        labels["selection"] = list([_selection_name(k) for k in selections])
    else:
        labels["selection"] = selection_labels

    # visual_grid = np.moveaxis(total_grid, move.keys(), move.values())
    # np.moveaxis is in np 1.11 only?, use transpose
    axes = [None] * len(move)
    for key, value in move.items():
        axes[value] = key
    visual_grid = np.transpose(total_grid, axes)

    logger.debug("grid shape: %r", total_grid.shape)
    logger.debug("visual: %r", visual.items())
    logger.debug("move: %r", move)
    logger.debug("visual grid shape: %r", visual_grid.shape)

    xexpressions = []
    yexpressions = []
    for i, (binby, limits) in enumerate(zip(x, xlimits)):
        xexpressions.append(binby[0])
        yexpressions.append(binby[1])
    if xlabel is None:
        xlabels = []
        ylabels = []
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            if z is not None:
                xlabels.append(self.label(binby[1]))
                ylabels.append(self.label(binby[2]))
            else:
                xlabels.append(self.label(binby[0]))
                ylabels.append(self.label(binby[1]))
    else:
        Nl = visual_grid.shape[visual_axes['row']]
        xlabels = _expand(xlabel, Nl)
        ylabels = _expand(ylabel, Nl)
    #labels[visual["x"]] = (xlabels, ylabels)
    labels["x"] = xlabels
    labels["y"] = ylabels

    # grid = total_grid
    # print(grid.shape)
    # grid = self.reduce(grid, )
    axes = []
    # cax = pylab.subplot(1,1,1)

    background_color = np.array(
        matplotlib.colors.colorConverter.to_rgb(background_color))

    # if grid.shape[axis["selection"]] > 1:#  and not facet:
    #   rgrid = vaex.image.fade(rgrid)
    #   finite_mask = np.any(finite_mask, axis=0) # do we really need this
    #   print(rgrid.shape)
    # facet_row_axis = axis["what"]
    import math
    facet_columns = None
    facets = visual_grid.shape[visual_axes["row"]] * visual_grid.shape[
        visual_axes["column"]]
    if visual_grid.shape[visual_axes["column"]] == 1 and wrap:
        facet_columns = min(wrap_columns,
                            visual_grid.shape[visual_axes["row"]])
        wrapped = True
    elif visual_grid.shape[visual_axes["row"]] == 1 and wrap:
        facet_columns = min(wrap_columns,
                            visual_grid.shape[visual_axes["column"]])
        wrapped = True
    else:
        wrapped = False
        facet_columns = visual_grid.shape[visual_axes["column"]]
    facet_rows = int(math.ceil(facets / facet_columns))
    logger.debug("facet_rows: %r", facet_rows)
    logger.debug("facet_columns: %r", facet_columns)
    # if visual_grid.shape[visual_axes["row"]] > 1: # and not wrap:
    #   #facet_row_axis = axis["what"]
    #   facet_columns = visual_grid.shape[visual_axes["column"]]
    # else:
    #   facet_columns = min(wrap_columns, facets)
    # if grid.shape[axis["plot"]] > 1:#  and not facet:

    # this loop could be done using axis arguments everywhere
    # assert len(normalize_axis) == 1, "currently only 1 normalization axis supported"
    grid = visual_grid * 1.
    fgrid = visual_grid * 1.
    ngrid = visual_grid * 1.
    # colorgrid = np.zeros(ngrid.shape + (4,), float)
    # print "norma", normalize_axis, visual_grid.shape[visual_axes[visual[normalize_axis]]]
    vmins = _expand(vmin,
                    visual_grid.shape[visual_axes[visual[normalize_axis]]],
                    type=list)
    vmaxs = _expand(vmax,
                    visual_grid.shape[visual_axes[visual[normalize_axis]]],
                    type=list)
    # for name in normalize_axis:
    if smooth_pre:
        grid = vaex.grids.gf(grid, smooth_pre)
    if 1:
        axis = visual_axes[visual[normalize_axis]]
        for i in range(visual_grid.shape[axis]):
            item = [
                slice(None, None, None),
            ] * len(visual_grid.shape)
            item[axis] = i
            item = tuple(item)
            f = _parse_f(fs[i])
            with np.errstate(divide='ignore', invalid='ignore'
                             ):  # these are fine, we are ok with nan's in vaex
                fgrid.__setitem__(item, f(grid.__getitem__(item)))
            # print vmins[i], vmaxs[i]
            if vmins[i] is not None and vmaxs[i] is not None:
                nsubgrid = fgrid.__getitem__(item) * 1
                nsubgrid -= vmins[i]
                nsubgrid /= (vmaxs[i] - vmins[i])
            else:
                nsubgrid, vmin, vmax = n(fgrid.__getitem__(item))
                vmins[i] = vmin
                vmaxs[i] = vmax
            # print "    ", vmins[i], vmaxs[i]
            ngrid.__setitem__(item, nsubgrid)

    if 0:  # TODO: above should be like the code below, with custom vmin and vmax
        grid = visual_grid[i]
        f = _parse_f(fs[i])
        fgrid = f(grid)
        finite_mask = np.isfinite(grid)
        finite_mask = np.any(finite_mask, axis=0)
        if vmin is not None and vmax is not None:
            ngrid = fgrid * 1
            ngrid -= vmin
            ngrid /= (vmax - vmin)
            ngrid = np.clip(ngrid, 0, 1)
        else:
            ngrid, vmin, vmax = n(fgrid)
            # vmin, vmax = np.nanmin(fgrid), np.nanmax(fgrid)
    # every 'what', should have its own colorbar, check if what corresponds to
    # rows or columns in facets, if so, do a colorbar per row or per column

    rows, columns = int(math.ceil(facets /
                                  float(facet_columns))), facet_columns
    colorbar_location = "individual"
    if visual["what"] == "row" and visual_grid.shape[1] == facet_columns:
        colorbar_location = "per_row"
    if visual["what"] == "column" and visual_grid.shape[0] == facet_rows:
        colorbar_location = "per_column"
    # values = np.linspace(facet_limits[0], facet_limits[1], facet_count+1)
    logger.debug("rows: %r, columns: %r", rows, columns)
    import matplotlib.gridspec as gridspec
    column_scale = 1
    row_scale = 1
    row_offset = 0
    if facets > 1:
        if colorbar_location == "per_row":
            column_scale = 4
            gs = gridspec.GridSpec(rows, columns * column_scale + 1)
        elif colorbar_location == "per_column":
            row_offset = 1
            row_scale = 4
            gs = gridspec.GridSpec(rows * row_scale + 1, columns)
        else:
            gs = gridspec.GridSpec(rows, columns)
    facet_index = 0
    fs = _expand(f, len(whats))
    colormaps = _expand(colormap, len(whats))

    # row
    for i in range(visual_grid.shape[0]):
        # column
        for j in range(visual_grid.shape[1]):
            if colorbar and colorbar_location == "per_column" and i == 0:
                norm = matplotlib.colors.Normalize(vmins[j], vmaxs[j])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[j])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[0, j])
                    colorbar = fig.colorbar(sm,
                                            cax=ax,
                                            orientation="horizontal")
                else:
                    colorbar = fig.colorbar(sm)
                if "what" in labels:
                    label = labels["what"][j]
                    if facets > 1:
                        colorbar.ax.set_title(label)
                    else:
                        colorbar.ax.set_ylabel(colorbar_label or label)

            if colorbar and colorbar_location == "per_row" and j == 0:
                norm = matplotlib.colors.Normalize(vmins[i], vmaxs[i])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[i])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[i, -1])
                    colorbar = fig.colorbar(sm, cax=ax)
                else:
                    colorbar = fig.colorbar(sm)
                label = labels["what"][i]
                colorbar.ax.set_ylabel(colorbar_label or label)

            rgrid = ngrid[i, j] * 1.
            # print rgrid.shape
            if smooth_post is not None:
                for k in range(rgrid.shape[0]):
                    for l in range(rgrid.shape[1]):
                        rgrid[k, l] = vaex.grids.gf(rgrid[k, l], smooth_post)
            if visual["what"] == "column":
                what_index = j
            elif visual["what"] == "row":
                what_index = i
            else:
                what_index = 0

            if visual[normalize_axis] == "column":
                normalize_index = j
            elif visual[normalize_axis] == "row":
                normalize_index = i
            else:
                normalize_index = 0
            for r in reduce:
                r = _parse_reduction(r, colormaps[what_index], [])
                rgrid = r(rgrid)

            row = facet_index // facet_columns
            column = facet_index % facet_columns

            if colorbar and colorbar_location == "individual":
                # visual_grid.shape[visual_axes[visual[normalize_axis]]]
                norm = matplotlib.colors.Normalize(vmins[normalize_index],
                                                   vmaxs[normalize_index])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[what_index])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[row, column])
                    colorbar = fig.colorbar(sm, ax=ax)
                else:
                    colorbar = fig.colorbar(sm)
                label = labels["what"][what_index]
                colorbar.ax.set_ylabel(colorbar_label or label)

            if facets > 1:
                ax = pylab.subplot(
                    gs[row_offset + row * row_scale:row_offset +
                       (row + 1) * row_scale,
                       column * column_scale:(column + 1) * column_scale])
            else:
                ax = pylab.gca()
            axes.append(ax)
            logger.debug("rgrid: %r", rgrid.shape)
            plot_rgrid = rgrid
            assert plot_rgrid.shape[1] == 1, "no layers supported yet"
            plot_rgrid = plot_rgrid[:, 0]
            if plot_rgrid.shape[0] > 1:
                plot_rgrid = vaex.image.fade(plot_rgrid[::-1])
            else:
                plot_rgrid = plot_rgrid[0]
            extend = None
            if visual["subspace"] == "row":
                subplot_index = i
            elif visual["subspace"] == "column":
                subplot_index = j
            else:
                subplot_index = 0
            extend = np.array(xlimits[subplot_index][-2:]).flatten()
            #   extend = np.array(xlimits[i]).flatten()
            logger.debug("plot rgrid: %r", plot_rgrid.shape)
            plot_rgrid = np.transpose(plot_rgrid, (1, 0, 2))
            im = ax.imshow(plot_rgrid,
                           extent=extend.tolist(),
                           origin="lower",
                           aspect=aspect,
                           interpolation=interpolation)

            # v1, v2 = values[i], values[i+1]

            def label(index, label, expression):
                if label and _issequence(label):
                    return label[index]
                else:
                    return self.label(expression)

            if visual_reverse["x"] == 'x':
                labelsx = labels['x']
                pylab.xlabel(labelsx[subplot_index])
            if visual_reverse["x"] == 'x':
                labelsy = labels['y']
                pylab.ylabel(labelsy[subplot_index])
            if visual["z"] in ['row']:
                labelsz = labels['z']
                ax.set_title(labelsz[i])
            if visual["z"] in ['column']:
                labelsz = labels['z']
                ax.set_title(labelsz[j])

            max_labels = 10
            xexpression = xexpressions[i]
            if self.iscategory(xexpression):
                labels = self.category_labels(xexpression)
                step = max(len(labels) // max_labels, 1)
                pylab.xticks(np.arange(len(labels))[::step],
                             labels[::step],
                             size='small')
            yexpression = yexpressions[i]
            if self.iscategory(yexpression):
                labels = self.category_labels(yexpression)
                step = max(len(labels) // max_labels, 1)
                pylab.yticks(np.arange(len(labels))[::step],
                             labels[::step],
                             size='small')
            facet_index += 1
    if title:
        fig.suptitle(title, fontsize="x-large")
    if tight_layout:
        if title:
            pylab.tight_layout(rect=[0, 0.03, 1, 0.95])
        else:
            pylab.tight_layout()
    if hardcopy:
        pylab.savefig(hardcopy)
    if show:
        pylab.show()
    if return_extra:
        return im, grid, fgrid, ngrid, rgrid
    else:
        return im
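
The grid-to-visual mapping described in the plot() docstring boils down to a single axis permutation. A minimal NumPy-only sketch of that remapping on a toy grid (the shapes and the non-default mapping of 'what' to rows are made up for illustration):

    import numpy as np

    # toy 6-d grid with axes (subspace, what, selection, z, y, x)
    grid = np.zeros((1, 2, 1, 1, 4, 8))

    grid_axes = dict(x=-1, y=-2, z=-3, selection=-4, what=-5, subspace=-6)
    visual_axes = dict(x=-1, y=-2, layer=-3, fade=-4, column=-5, row=-6)
    # map each grid axis to a visual axis; here 'what' goes to rows, 'subspace' to columns
    visual = dict(x='x', y='y', z='layer', selection='fade', what='row', subspace='column')

    move = {grid_axes[g]: visual_axes[v] for g, v in visual.items()}
    axes = [None] * len(move)
    for src, dst in move.items():
        axes[dst] = src  # same trick plot() uses instead of np.moveaxis
    visual_grid = np.transpose(grid, axes)
    print(visual_grid.shape)  # (2, 1, 1, 1, 4, 8): the two 'what' entries became rows
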
def plot1d(self,
           x=None,
           what="count(*)",
           grid=None,
           shape=64,
           facet=None,
           limits=None,
           figsize=None,
           f="identity",
           n=None,
           normalize_axis=None,
           xlabel=None,
           ylabel=None,
           label=None,
           selection=None,
           show=False,
           tight_layout=True,
           hardcopy=None,
           progress=None,
           **kwargs):
    """Viz data in 1d (histograms, running means etc)

    Example

    >>> df.plot1d(df.x)
    >>> df.plot1d(df.x, limits=[0, 100], shape=100)
    >>> df.plot1d(df.x, what='mean(y)', limits=[0, 100], shape=100)

    If you want to do a computation yourself, pass the grid argument, but you are responsible for passing the
    same limits arguments:

    >>> means = df.mean(df.y, binby=df.x, limits=[0, 100], shape=100)/100.
    >>> df.plot1d(df.x, limits=[0, 100], shape=100, grid=means, label='mean(y)/100')

    :param x: Expression to bin in the x direction
    :param what: What to plot, count(*) will show a N-d histogram, mean('x'), the mean of the x column, sum('x') the sum
    :param grid: If the binning is done before by yourself, you can pass it
    :param facet: Expression to produce facetted plots ( facet='x:0,1,12' will produce 12 plots with x in a range between 0 and 1)
    :param limits: list of [xmin, xmax], or a description such as 'minmax', '99%'
    :param figsize: (x, y) tuple passed to pylab.figure for setting the figure size
    :param f: transform values by: 'identity' does nothing 'log' or 'log10' will show the log of the value
    :param n: normalization function, currently only 'normalize' is supported, or None for no normalization
    :param normalize_axis: which axes to normalize on, None means normalize by the global maximum.
    :param xlabel: String for label on x axis (may contain latex)
    :param ylabel: Same for y axis
    :param tight_layout: call pylab.tight_layout or not
    :param kwargs: extra argument passed to pylab.plot
    :return:
    """

    import pylab
    f = _parse_f(f)
    n = _parse_n(n)
    if type(shape) == int:
        shape = (shape, )
    binby = []
    x = _ensure_strings_from_expressions(x)
    for expression in [x]:
        if expression is not None:
            binby = [expression] + binby
    limits = self.limits(binby, limits)
    if figsize is not None:
        pylab.figure(num=None,
                     figsize=figsize,
                     dpi=80,
                     facecolor='w',
                     edgecolor='k')
    fig = pylab.gcf()
    import re
    import ast
    if facet is not None:
        match = re.match("(.*):(.*),(.*),(.*)", facet)
        if match:
            groups = match.groups()
            facet_expression = groups[0]
            facet_limits = [
                ast.literal_eval(groups[1]),
                ast.literal_eval(groups[2])
            ]
            facet_count = ast.literal_eval(groups[3])
            limits.append(facet_limits)
            binby.append(facet_expression)
            shape = (facet_count, ) + shape
        else:
            raise ValueError(
                "Could not understand 'facet' argument %r, expected something in form: 'column:-1,10:5'"
                % facet)

    if grid is None:
        if what:
            if isinstance(what, (vaex.stat.Expression)):
                grid = what.calculate(self,
                                      binby=binby,
                                      limits=limits,
                                      shape=shape,
                                      selection=selection)
            else:
                what = what.strip()
                index = what.index("(")
                import re
                groups = re.match("(.*)\((.*)\)", what).groups()
                if groups and len(groups) == 2:
                    function = groups[0]
                    arguments = groups[1].strip()
                    functions = ["mean", "sum", "std", "count"]
                    if function in functions:
                        # grid = getattr(self, function)(arguments, binby, limits=limits, shape=shape, selection=selection)
                        grid = getattr(vaex.stat,
                                       function)(arguments).calculate(
                                           self,
                                           binby=binby,
                                           limits=limits,
                                           shape=shape,
                                           selection=selection,
                                           progress=progress)
                    elif function == "count" and arguments == "*":
                        grid = self.count(binby=binby,
                                          shape=shape,
                                          limits=limits,
                                          selection=selection,
                                          progress=progress)
                    elif function == "cumulative" and arguments == "*":
                        # TODO: comulative should also include the tails outside limits
                        grid = self.count(binby=binby,
                                          shape=shape,
                                          limits=limits,
                                          selection=selection,
                                          progress=progress)
                        grid = np.cumsum(grid)
                    else:
                        raise ValueError(
                            "Could not understand method: %s, expected one of %r'"
                            % (function, functions))
                else:
                    raise ValueError(
                        "Could not understand 'what' argument %r, expected something in form: 'count(*)', 'mean(x)'"
                        % what)
        else:
            grid = self.histogram(binby,
                                  size=shape,
                                  limits=limits,
                                  selection=selection)
    fgrid = f(grid)
    if n is not None:
        # ngrid = n(fgrid, axis=normalize_axis)
        ngrid = fgrid / fgrid.sum()
    else:
        ngrid = fgrid
        # reductions = [_parse_reduction(r, colormap, colors) for r in reduce]
        # rgrid = ngrid * 1.
        # for r in reduce:
        #   r = _parse_reduction(r, colormap, colors)
        #   rgrid = r(rgrid)
        # grid = self.reduce(grid, )
    xmin, xmax = limits[-1]
    if facet:
        N = len(grid[-1])
    else:
        N = len(grid)
    xexpression = binby[0]
    xar = np.arange(N + 1) / (N - 0.) * (xmax - xmin) + xmin
    label = str(label or selection or x)
    if facet:
        import math
        rows, columns = int(math.ceil(facet_count / 4.)), 4
        values = np.linspace(facet_limits[0], facet_limits[1], facet_count + 1)
        for i in range(facet_count):
            ax = pylab.subplot(rows, columns, i + 1)
            value = ax.plot(xar,
                            ngrid[i],
                            drawstyle="steps-mid",
                            label=label,
                            **kwargs)
            v1, v2 = values[i], values[i + 1]
            pylab.xlabel(xlabel or x)
            pylab.ylabel(ylabel or what)
            ax.set_title("%3f <= %s < %3f" % (v1, facet_expression, v2))
            if self.iscategory(xexpression):
                labels = self.category_labels(xexpression)
                max_labels = 10  # cap the number of category tick labels shown
                step = max(len(labels) // max_labels, 1)
                pylab.xticks(range(len(labels))[::step],
                             labels[::step],
                             size='small')
    else:
        # im = pylab.imshow(rgrid, extent=np.array(limits[:2]).flatten(), origin="lower", aspect=aspect)
        pylab.xlabel(xlabel or self.label(x))
        pylab.ylabel(ylabel or what)
        # print(xar, ngrid)
        # repeat the first element, that's how plot/steps likes it..
        g = np.concatenate([ngrid[0:1], ngrid])
        value = pylab.plot(xar,
                           g,
                           drawstyle="steps-pre",
                           label=label,
                           **kwargs)
        if self.iscategory(xexpression):
            labels = self.category_labels(xexpression)
            max_labels = 10  # cap the number of category tick labels shown
            step = max(len(labels) // max_labels, 1)
            pylab.xticks(range(len(labels))[::step],
                         labels[::step],
                         size='small')
    if tight_layout:
        pylab.tight_layout()
    if hardcopy:
        pylab.savefig(hardcopy)
    if show:
        pylab.show()
    return value
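# A minimal standalone sketch of how a facet specification of the form 'column:min,max,count' is taken
# apart by the regex used above; the column name and numbers below are made up purely for illustration.
import ast
import re

facet_spec = "FeH:-3,1,5"  # hypothetical facet spec
match = re.match(r"(.*):(.*),(.*),(.*)", facet_spec)
if match is None:
    raise ValueError("expected something of the form 'column:min,max,count', got %r" % facet_spec)
facet_expression, lo, hi, count = match.groups()
facet_limits = [ast.literal_eval(lo), ast.literal_eval(hi)]  # [-3, 1]
facet_count = ast.literal_eval(count)                        # 5
print(facet_expression, facet_limits, facet_count)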
def scatter(self,
            x,
            y,
            xerr=None,
            yerr=None,
            cov=None,
            corr=None,
            s_expr=None,
            c_expr=None,
            labels=None,
            selection=None,
            length_limit=50000,
            length_check=True,
            label=None,
            xlabel=None,
            ylabel=None,
            errorbar_kwargs={},
            ellipse_kwargs={},
            **kwargs):
    """Viz (small amounts) of data in 2d using a scatter plot

    Convenience wrapper around pylab.scatter when for working with small DataFrames or selections

    :param x: Expression for x axis
    :param y: Idem for y
    :param s_expr: When given, use if for the s (size) argument of pylab.scatter
    :param c_expr: When given, use if for the c (color) argument of pylab.scatter
    :param labels: Annotate the points with these text values
    :param selection: Single selection expression, or None
    :param length_limit: maximum number of rows it will plot
    :param length_check: should we do the maximum row check or not?
    :param label: label for the legend
    :param xlabel: label for x axis, if None .label(x) is used
    :param ylabel: label for y axis, if None .label(y) is used
    :param errorbar_kwargs: extra dict with arguments passed to plt.errorbar
    :param kwargs: extra arguments passed to pylab.scatter
    :return:
    """
    import pylab as plt
    x = _ensure_strings_from_expressions(x)
    y = _ensure_strings_from_expressions(y)
    label = str(label or selection)
    selection = _ensure_strings_from_expressions(selection)
    if length_check:
        count = self.count(selection=selection)
        if count > length_limit:
            raise ValueError(
                "the number of rows (%d) is above the limit (%d), pass length_check=False, or increase length_limit"
                % (count, length_limit))
    x_values = self.evaluate(x, selection=selection)
    y_values = self.evaluate(y, selection=selection)
    if s_expr:
        kwargs["s"] = self.evaluate(s_expr, selection=selection)
    if c_expr:
        kwargs["c"] = self.evaluate(c_expr, selection=selection)
    plt.xlabel(xlabel or self.label(x))
    plt.ylabel(ylabel or self.label(y))
    s = plt.scatter(x_values, y_values, label=label, **kwargs)
    if labels:
        label_values = self.evaluate(labels, selection=selection)
        for i, label_value in enumerate(label_values):
            plt.annotate(label_value, (x_values[i], y_values[i]))
    xerr_values = None
    yerr_values = None
    if cov is not None or corr is not None:
        from matplotlib.patches import Ellipse
        sx = self.evaluate(xerr, selection=selection)
        sy = self.evaluate(yerr, selection=selection)
        if corr is not None:
            sxy = self.evaluate(corr, selection=selection) * sx * sy
        elif cov is not None:
            sxy = self.evaluate(cov, selection=selection)
        cov_matrix = np.zeros((len(sx), 2, 2))
        cov_matrix[:, 0, 0] = sx**2
        cov_matrix[:, 1, 1] = sy**2
        cov_matrix[:, 0, 1] = cov_matrix[:, 1, 0] = sxy
        ax = plt.gca()
        ellipse_kwargs = dict(ellipse_kwargs)
        ellipse_kwargs['facecolor'] = ellipse_kwargs.get('facecolor', 'none')
        ellipse_kwargs['edgecolor'] = ellipse_kwargs.get('edgecolor', 'black')
        for i in range(len(sx)):
            eigen_values, eigen_vectors = np.linalg.eig(cov_matrix[i])
            indices = np.argsort(eigen_values)[::-1]
            eigen_values = eigen_values[indices]
            eigen_vectors = eigen_vectors[:, indices]
            v1 = eigen_vectors[:, 0]
            v2 = eigen_vectors[:, 1]
            varx = cov_matrix[i, 0, 0]
            vary = cov_matrix[i, 1, 1]
            angle = np.arctan2(v1[1], v1[0])
            # round-off errors can cause tiny negative eigenvalues; clip them to zero
            if eigen_values[1] < 0 and abs(
                (eigen_values[1] / eigen_values[0])) < 1e-10:
                eigen_values[1] = 0
            if eigen_values[0] < 0 or eigen_values[1] < 0:
                raise ValueError('negative eigenvalue in covariance matrix')
            width, height = np.sqrt(np.max(eigen_values)), np.sqrt(
                np.min(eigen_values))
            e = Ellipse(xy=(x_values[i], y_values[i]),
                        width=width,
                        height=height,
                        angle=np.degrees(angle),
                        **ellipse_kwargs)
            ax.add_artist(e)
    else:
        if xerr is not None:
            if _issequence(xerr):
                assert len(
                    xerr
                ) == 2, "if xerr is a sequence it should be of length 2"
                xerr_values = [
                    self.evaluate(xerr[0], selection=selection),
                    self.evaluate(xerr[1], selection=selection)
                ]
            else:
                xerr_values = self.evaluate(xerr, selection=selection)
        if yerr is not None:
            if _issequence(yerr):
                assert len(
                    yerr
                ) == 2, "if yerr is a sequence it should be of length 2"
                yerr_values = [
                    self.evaluate(yerr[0], selection=selection),
                    self.evaluate(yerr[1], selection=selection)
                ]
            else:
                yerr_values = self.evaluate(yerr, selection=selection)
        if xerr_values is not None or yerr_values is not None:
            errorbar_kwargs = dict(errorbar_kwargs)
            errorbar_kwargs['fmt'] = errorbar_kwargs.get('fmt', 'none')
            plt.errorbar(x_values,
                         y_values,
                         yerr=yerr_values,
                         xerr=xerr_values,
                         **errorbar_kwargs)
    return s
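# A minimal standalone sketch of the eigendecomposition step used by the cov/corr branch above to turn a
# per-point 2x2 covariance matrix into an error ellipse; the sigmas and correlation are made-up numbers.
import numpy as np

sx, sy, rho = 1.0, 0.5, 0.8                      # hypothetical sigma_x, sigma_y and correlation
cov_2x2 = np.array([[sx**2, rho * sx * sy],
                    [rho * sx * sy, sy**2]])
eigen_values, eigen_vectors = np.linalg.eig(cov_2x2)
order = np.argsort(eigen_values)[::-1]           # largest eigenvalue first
eigen_values = eigen_values[order]
eigen_vectors = eigen_vectors[:, order]
width, height = np.sqrt(eigen_values)            # the width/height passed to Ellipse above
angle = np.degrees(np.arctan2(eigen_vectors[1, 0], eigen_vectors[0, 0]))
print(width, height, angle)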
Example #30
0
def evaluate(self, expression, i1=None, i2=None, out=None, selection=None, delay=False):
    """Basic support for evaluate at the server, at least enough to run some unit tests; do not expect this to work from strings."""
    expression = _ensure_strings_from_expressions(expression)
    result = self.server._call_dataset("evaluate", self, expression=expression, i1=i1, i2=i2, selection=selection, delay=delay)
    # TODO: we ignore out
    return result
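# A hedged usage sketch of the server-side evaluate above; the ws:// URL, port, dataset name and the
# column 'x' are placeholders, and a vaex server is assumed to be running and serving that dataset.
import vaex

df_remote = vaex.open("ws://localhost:9000/example")   # placeholder server URL and dataset name
values = df_remote.evaluate("x", i1=0, i2=10)          # evaluate the first 10 values of column x
print(values)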
Example #31
0
def scatter(self, x, y, xerr=None, yerr=None, cov=None, corr=None, s_expr=None, c_expr=None, labels=None, selection=None, length_limit=50000,
    length_check=True, label=None, xlabel=None, ylabel=None, errorbar_kwargs={}, ellipse_kwargs={}, **kwargs):
    """Viz (small amounts) of data in 2d using a scatter plot

    Convenience wrapper around pylab.scatter when for working with small DataFrames or selections

    :param x: Expression for x axis
    :param y: Idem for y
    :param s_expr: When given, use if for the s (size) argument of pylab.scatter
    :param c_expr: When given, use if for the c (color) argument of pylab.scatter
    :param labels: Annotate the points with these text values
    :param selection: Single selection expression, or None
    :param length_limit: maximum number of rows it will plot
    :param length_check: should we do the maximum row check or not?
    :param label: label for the legend
    :param xlabel: label for x axis, if None .label(x) is used
    :param ylabel: label for y axis, if None .label(y) is used
    :param errorbar_kwargs: extra dict with arguments passed to plt.errorbar
    :param kwargs: extra arguments passed to pylab.scatter
    :return:
    """
    import pylab as plt
    x = _ensure_strings_from_expressions(x)
    y = _ensure_strings_from_expressions(y)
    label = str(label or selection)
    selection = _ensure_strings_from_expressions(selection)
    if length_check:
        count = self.count(selection=selection)
        if count > length_limit:
            raise ValueError("the number of rows (%d) is above the limit (%d), pass length_check=False, or increase length_limit" % (count, length_limit))
    x_values = self.evaluate(x, selection=selection)
    y_values = self.evaluate(y, selection=selection)
    if s_expr:
        kwargs["s"] = self.evaluate(s_expr, selection=selection)
    if c_expr:
        kwargs["c"] = self.evaluate(c_expr, selection=selection)
    plt.xlabel(xlabel or self.label(x))
    plt.ylabel(ylabel or self.label(y))
    s = plt.scatter(x_values, y_values, label=label, **kwargs)
    if labels:
        label_values = self.evaluate(labels, selection=selection)
        for i, label_value in enumerate(label_values):
            plt.annotate(label_value, (x_values[i], y_values[i]))
    xerr_values = None
    yerr_values = None
    if cov is not None or corr is not None:
        from matplotlib.patches import Ellipse
        sx = self.evaluate(xerr, selection=selection)
        sy = self.evaluate(yerr, selection=selection)
        if corr is not None:
            sxy = self.evaluate(corr, selection=selection) * sx * sy
        elif cov is not None:
            sxy = self.evaluate(cov, selection=selection)
        cov_matrix = np.zeros((len(sx), 2, 2))
        cov_matrix[:,0,0] = sx**2
        cov_matrix[:,1,1] = sy**2
        cov_matrix[:,0,1] = cov_matrix[:,1,0] = sxy
        ax = plt.gca()
        ellipse_kwargs = dict(ellipse_kwargs)
        ellipse_kwargs['facecolor'] = ellipse_kwargs.get('facecolor', 'none')
        ellipse_kwargs['edgecolor'] = ellipse_kwargs.get('edgecolor', 'black')
        for i in range(len(sx)):
            eigen_values, eigen_vectors = np.linalg.eig(cov_matrix[i])
            indices = np.argsort(eigen_values)[::-1]
            eigen_values = eigen_values[indices]
            eigen_vectors = eigen_vectors[:,indices]
            v1 = eigen_vectors[:, 0]
            v2 = eigen_vectors[:, 1]
            varx = cov_matrix[i, 0, 0]
            vary = cov_matrix[i, 1, 1]
            angle = np.arctan2(v1[1], v1[0])
            # round-off errors can cause tiny negative eigenvalues; clip them to zero
            if eigen_values[1] < 0 and abs((eigen_values[1]/eigen_values[0])) < 1e-10:
                eigen_values[1] = 0
            if eigen_values[0] < 0 or eigen_values[1] < 0:
                raise ValueError('negative eigenvalue in covariance matrix')
            width, height = np.sqrt(np.max(eigen_values)), np.sqrt(np.min(eigen_values))
            e = Ellipse(xy=(x_values[i], y_values[i]), width=width, height=height, angle=np.degrees(angle), **ellipse_kwargs)
            ax.add_artist(e)
    else:
        if xerr is not None:
            if _issequence(xerr):
                assert len(xerr) == 2, "if xerr is a sequence it should be of length 2"
                xerr_values = [self.evaluate(xerr[0], selection=selection), self.evaluate(xerr[1], selection=selection)]
            else:
                xerr_values = self.evaluate(xerr, selection=selection)
        if yerr is not None:
            if _issequence(yerr):
                assert len(yerr) == 2, "if yerr is a sequence it should be of length 2"
                yerr_values = [self.evaluate(yerr[0], selection=selection), self.evaluate(yerr[1], selection=selection)]
            else:
                yerr_values = self.evaluate(yerr, selection=selection)
        if xerr_values is not None or yerr_values is not None:
            errorbar_kwargs = dict(errorbar_kwargs)
            errorbar_kwargs['fmt'] = errorbar_kwargs.get('fmt', 'none')
            plt.errorbar(x_values, y_values, yerr=yerr_values, xerr=xerr_values, **errorbar_kwargs)
    return s
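# A hedged usage sketch of the scatter wrapper above, assuming it is available as a DataFrame method (as
# in the code here) and that the built-in example dataset has columns named x, y and FeH.
import vaex
import pylab

df = vaex.example()
df.scatter(df.x, df.y,
           c_expr=df.FeH,              # color the points by a third column
           selection='x < -10',        # illustrative selection; pass length_check=False if it is still too large
           xlabel='x', ylabel='y')
pylab.show()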
Example #32
0
def plot1d(self, x=None, what="count(*)", grid=None, shape=64, facet=None, limits=None, figsize=None, f="identity", n=None, normalize_axis=None,
           xlabel=None, ylabel=None, label=None,
           selection=None, show=False, tight_layout=True, hardcopy=None,
           **kwargs):
    """Viz data in 1d (histograms, running means etc)

    Example

    >>> df.plot1d(df.x)
    >>> df.plot1d(df.x, limits=[0, 100], shape=100)
    >>> df.plot1d(df.x, what='mean(y)', limits=[0, 100], shape=100)

    If you want to do a computation yourself, pass the grid argument, but you are responsible for passing the
    same limits argument:

    >>> means = df.mean(df.y, binby=df.x, limits=[0, 100], shape=100)/100.
    >>> df.plot1d(df.x, limits=[0, 100], shape=100, grid=means, label='mean(y)/100')

    :param x: Expression to bin in the x direction
    :param what: What to plot, count(*) will show an N-d histogram, mean('x') the mean of the x column, sum('x') the sum
    :param grid: If the binning is done before by yourself, you can pass it
    :param facet: Expression to produce faceted plots (facet='x:0,1,12' will produce 12 plots with x in a range between 0 and 1)
    :param limits: list of [xmin, xmax], or a description such as 'minmax', '99%'
    :param figsize: (x, y) tuple passed to pylab.figure for setting the figure size
    :param f: transform values by: 'identity' does nothing, 'log' or 'log10' will show the log of the value
    :param n: normalization function, currently only 'normalize' is supported, or None for no normalization
    :param normalize_axis: which axes to normalize on, None means normalize by the global maximum.
    :param xlabel: String for label on x axis (may contain latex)
    :param ylabel: Same for y axis
    :param tight_layout: call pylab.tight_layout or not
    :param kwargs: extra argument passed to pylab.plot
    :return:
    """

    import pylab
    f = _parse_f(f)
    n = _parse_n(n)
    if type(shape) == int:
        shape = (shape,)
    binby = []
    x = _ensure_strings_from_expressions(x)
    for expression in [x]:
        if expression is not None:
            binby = [expression] + binby
    limits = self.limits(binby, limits)
    if figsize is not None:
        pylab.figure(num=None, figsize=figsize, dpi=80, facecolor='w', edgecolor='k')
    fig = pylab.gcf()
    import re
    if facet is not None:
        match = re.match("(.*):(.*),(.*),(.*)", facet)
        if match:
            groups = match.groups()
            facet_expression = groups[0]
            facet_limits = [ast.literal_eval(groups[1]), ast.literal_eval(groups[2])]
            facet_count = ast.literal_eval(groups[3])
            limits.append(facet_limits)
            binby.append(facet_expression)
            shape = (facet_count,) + shape
        else:
            raise ValueError("Could not understand 'facet' argument %r, expected something in form: 'column:-1,10:5'" % facet)

    if grid is None:
        if what:
            if isinstance(what, (vaex.stat.Expression)):
                grid = what.calculate(self, binby=binby, limits=limits, shape=shape, selection=selection)
            else:
                what = what.strip()
                index = what.index("(")
                import re
                groups = re.match(r"(.*)\((.*)\)", what).groups()
                if groups and len(groups) == 2:
                    function = groups[0]
                    arguments = groups[1].strip()
                    functions = ["mean", "sum", "std", "count"]
                    if function in functions:
                        # grid = getattr(self, function)(arguments, binby, limits=limits, shape=shape, selection=selection)
                        grid = getattr(vaex.stat, function)(arguments).calculate(self, binby=binby, limits=limits, shape=shape, selection=selection)
                    elif function == "count" and arguments == "*":
                        grid = self.count(binby=binby, shape=shape, limits=limits, selection=selection)
                    elif function == "cumulative" and arguments == "*":
                        # TODO: cumulative should also include the tails outside limits
                        grid = self.count(binby=binby, shape=shape, limits=limits, selection=selection)
                        grid = np.cumsum(grid)
                    else:
                        raise ValueError("Could not understand method: %s, expected one of %r'" % (function, functions))
                else:
                    raise ValueError("Could not understand 'what' argument %r, expected something in form: 'count(*)', 'mean(x)'" % what)
        else:
            grid = self.histogram(binby, size=shape, limits=limits, selection=selection)
    fgrid = f(grid)
    if n is not None:
        # ngrid = n(fgrid, axis=normalize_axis)
        ngrid = fgrid / fgrid.sum()
    else:
        ngrid = fgrid
        # reductions = [_parse_reduction(r, colormap, colors) for r in reduce]
        # rgrid = ngrid * 1.
        # for r in reduce:
        #   r = _parse_reduction(r, colormap, colors)
        #   rgrid = r(rgrid)
        # grid = self.reduce(grid, )
    xmin, xmax = limits[-1]
    if facet:
        N = len(grid[-1])
    else:
        N = len(grid)
    xexpression = binby[0]
    xar = np.arange(N + 1) / (N - 0.) * (xmax - xmin) + xmin
    label = str(label or selection or x)
    if facet:
        import math
        rows, columns = int(math.ceil(facet_count / 4.)), 4
        values = np.linspace(facet_limits[0], facet_limits[1], facet_count + 1)
        for i in range(facet_count):
            ax = pylab.subplot(rows, columns, i + 1)
            value = ax.plot(xar, ngrid[i], drawstyle="steps-mid", label=label, **kwargs)
            v1, v2 = values[i], values[i + 1]
            pylab.xlabel(xlabel or x)
            pylab.ylabel(ylabel or what)
            ax.set_title("%3f <= %s < %3f" % (v1, facet_expression, v2))
            if self.iscategory(xexpression):
                labels = self.category_labels(xexpression)
                max_labels = 10  # cap the number of category tick labels shown
                step = max(len(labels) // max_labels, 1)
                pylab.xticks(range(len(labels))[::step], labels[::step], size='small')
    else:
        # im = pylab.imshow(rgrid, extent=np.array(limits[:2]).flatten(), origin="lower", aspect=aspect)
        pylab.xlabel(xlabel or self.label(x))
        pylab.ylabel(ylabel or what)
        # print(xar, ngrid)
        # repeat the first element, that's how plot/steps likes it..
        g = np.concatenate([ngrid[0:1], ngrid])
        value = pylab.plot(xar, g, drawstyle="steps-pre", label=label, **kwargs)
        if self.iscategory(xexpression):
            labels = self.category_labels(xexpression)
            max_labels = 10  # cap the number of category tick labels shown
            step = max(len(labels) // max_labels, 1)
            pylab.xticks(range(len(labels))[::step], labels[::step], size='small')
    if tight_layout:
        pylab.tight_layout()
    if hardcopy:
        pylab.savefig(hardcopy)
    if show:
        pylab.show()
    return value
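# A minimal standalone sketch of how a 'what' string such as 'mean(x)' is split into a function name and
# an argument before being dispatched to vaex.stat in plot1d above; the input string is only an example.
import re

what = "mean(x)"
match = re.match(r"(.*)\((.*)\)", what.strip())
if match is None:
    raise ValueError("expected something of the form 'count(*)' or 'mean(x)', got %r" % what)
function, arguments = match.group(1), match.group(2).strip()
print(function, arguments)   # mean x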
Example #33
0
def plot(self, x=None, y=None, z=None, what="count(*)", vwhat=None, reduce=["colormap"], f=None,
         normalize="normalize", normalize_axis="what",
         vmin=None, vmax=None,
         shape=256, vshape=32, limits=None, grid=None, colormap="afmhot",  # colors=["red", "green", "blue"],
         figsize=None, xlabel=None, ylabel=None, aspect="auto", tight_layout=True, interpolation="nearest", show=False,
         colorbar=True,
         colorbar_label=None,
         selection=None, selection_labels=None, title=None,
         background_color="white", pre_blend=False, background_alpha=1.,
         visual=dict(x="x", y="y", layer="z", fade="selection", row="subspace", column="what"),
         smooth_pre=None, smooth_post=None,
         wrap=True, wrap_columns=4,
         return_extra=False, hardcopy=None):
    """Viz data in a 2d histogram/heatmap.

    Declarative plotting of statistical plots using matplotlib, supports subplots, selections, layers.

    Instead of passing x and y, pass a list as the x argument for multiple panels. Give what a list of options to have multiple
    panels. When both are present, they will be organized in a column/row order.

    This method creates a 6-dimensional 'grid', where each dimension can map to a visual dimension.
    The grid dimensions are:

     * x: shape determined by shape, content by x argument or the first dimension of each space
     * y:   ,,
     * z:  related to the z argument
     * selection: shape equals length of selection argument
     * what: shape equals length of what argument
     * space: shape equals length of x argument if multiple values are given

     By default, its shape is (1, 1, 1, 1, shape, shape) (where x is the last dimension)

    The visual dimensions are

     * x: x coordinate on a plot / image (default maps to grid's x)
     * y: y   ,,                         (default maps to grid's y)
     * layer: each image in this dimension is blended together into one image (default maps to z)
     * fade: each image is shown faded on top of the previous image (default maps to selection)
     * row: rows of subplots (default maps to space)
     * column: columns of subplots (default maps to what)

    All these mappings can be changed by the visual argument; some examples:

    >>> df.plot('x', 'y', what=['mean(x)', 'correlation(vx, vy)'])

    Will plot each 'what' as a column.

    >>> df.plot('x', 'y', selection=['FeH < -3', '(FeH >= -3) & (FeH < -2)'], visual=dict(column='selection'))

    Will plot each selection as a column, instead of fading them on top of each other.

    :param x: Expression to bin in the x direction (by default maps to x), or list of pairs, like [['x', 'y'], ['x', 'z']], if multiple pairs are given, this dimension maps to rows by default
    :param y: Expression to bin in the y direction (by default maps to y)
    :param z: Expression to bin in the z direction, followed by a :start,end,shape signature, like 'FeH:-3,1,5' will produce 5 layers between -3 and 1 (by default maps to layer)
    :param what: What to plot, count(*) will show an N-d histogram, mean('x') the mean of the x column, sum('x') the sum, std('x') the standard deviation, correlation('vx', 'vy') the correlation coefficient. Can also be a list of values, like ['count(x)', 'std(vx)'], (by default maps to column)
    :param reduce:
    :param f: transform values by: 'identity' does nothing, 'log' or 'log10' will show the log of the value
    :param normalize: normalization function, currently only 'normalize' is supported
    :param normalize_axis: which axes to normalize on, None means normalize by the global maximum.
    :param vmin: instead of automatic normalization (using normalize and normalize_axis), scale the data between vmin and vmax to [0, 1]
    :param vmax: see vmin
    :param shape: shape/size of the n-D histogram grid
    :param limits: list of [[xmin, xmax], [ymin, ymax]], or a description such as 'minmax', '99%'
    :param grid: if the binning is done before by yourself, you can pass it
    :param colormap: matplotlib colormap to use
    :param figsize: (x, y) tuple passed to pylab.figure for setting the figure size
    :param xlabel:
    :param ylabel:
    :param aspect:
    :param tight_layout: call pylab.tight_layout or not
    :param colorbar: plot a colorbar or not
    :param interpolation: interpolation for imshow, possible options are: 'nearest', 'bilinear', 'bicubic', see matplotlib for more
    :param return_extra:
    :return:
    """
    import pylab
    import matplotlib
    n = _parse_n(normalize)
    if type(shape) == int:
        shape = (shape,) * 2
    binby = []
    x = _ensure_strings_from_expressions(x)
    y = _ensure_strings_from_expressions(y)
    for expression in [y, x]:
        if expression is not None:
            binby = [expression] + binby
    fig = pylab.gcf()
    if figsize is not None:
        fig.set_size_inches(*figsize)
    import re

    what_units = None
    whats = _ensure_list(what)
    selections = _ensure_list(selection)
    selections = _ensure_strings_from_expressions(selections)

    if y is None:
        waslist, [x, ] = vaex.utils.listify(x)
    else:
        waslist, [x, y] = vaex.utils.listify(x, y)
        x = list(zip(x, y))
        limits = [limits]

    # every plot has its own vwhat for now
    vwhats = _expand_limits(vwhat, len(x))  # TODO: we're abusing this function..
    logger.debug("x: %s", x)
    limits, shape = self.limits(x, limits, shape=shape)
    shape = shape[0]
    logger.debug("limits: %r", limits)

    # mapping of a grid axis to a label
    labels = {}
    shape = _expand_shape(shape, 2)
    vshape = _expand_shape(shape, 2)
    if z is not None:
        match = re.match("(.*):(.*),(.*),(.*)", z)
        if match:
            groups = match.groups()
            import ast
            z_expression = groups[0]
            logger.debug("found groups: %r", list(groups))
            z_limits = [ast.literal_eval(groups[1]), ast.literal_eval(groups[2])]
            z_shape = ast.literal_eval(groups[3])
            # for pair in x:
            x = [[z_expression] + list(k) for k in x]
            limits = np.array([[z_limits] + list(k) for k in limits])
            shape = (z_shape,) + shape
            vshape = (z_shape,) + vshape
            logger.debug("x = %r", x)
            values = np.linspace(z_limits[0], z_limits[1], num=z_shape + 1)
            labels["z"] = list(["%s <= %s < %s" % (v1, z_expression, v2) for v1, v2 in zip(values[:-1], values[1:])])
        else:
            raise ValueError("Could not understand 'z' argument %r, expected something in form: 'column:-1,10:5'" % facet)
    else:
        z_shape = 1

    # z == 1
    if z is None:
        total_grid = np.zeros((len(x), len(whats), len(selections), 1) + shape, dtype=float)
        total_vgrid = np.zeros((len(x), len(whats), len(selections), 1) + vshape, dtype=float)
    else:
        total_grid = np.zeros((len(x), len(whats), len(selections)) + shape, dtype=float)
        total_vgrid = np.zeros((len(x), len(whats), len(selections)) + vshape, dtype=float)
    logger.debug("shape of total grid: %r", total_grid.shape)
    axis = dict(plot=0, what=1, selection=2)
    xlimits = limits

    grid_axes = dict(x=-1, y=-2, z=-3, selection=-4, what=-5, subspace=-6)
    visual_axes = dict(x=-1, y=-2, layer=-3, fade=-4, column=-5, row=-6)
    # visual_default=dict(x="x", y="y", z="layer", selection="fade", subspace="row", what="column")
    # visual: mapping of a plot axis, to a grid axis
    visual_default = dict(x="x", y="y", layer="z", fade="selection", row="subspace", column="what")

    def invert(x): return dict((v, k) for k, v in x.items())
    # visual_default_reverse = invert(visual_default)
    # visual_ = visual_default
    # visual = dict(visual) # copy for modification
    # add entries to avoid mapping multiple times to the same axis
    free_visual_axes = list(visual_default.keys())
    # visual_reverse = invert(visual)
    logger.debug("1: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual.items():
        if visual_name in free_visual_axes:
            free_visual_axes.remove(visual_name)
        else:
            raise ValueError("visual axes %s used multiple times" % visual_name)
    logger.debug("2: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual_default.items():
        if visual_name in free_visual_axes and grid_name not in visual.values():
            free_visual_axes.remove(visual_name)
            visual[visual_name] = grid_name
    logger.debug("3: %r %r", visual, free_visual_axes)
    for visual_name, grid_name in visual_default.items():
        if visual_name not in free_visual_axes and grid_name not in visual.values():
            visual[free_visual_axes.pop(0)] = grid_name

    logger.debug("4: %r %r", visual, free_visual_axes)

    visual_reverse = invert(visual)
    # TODO: the meaning of visual and visual_reverse is changed below this line, super confusing
    visual, visual_reverse = visual_reverse, visual
    # so now, visual: mapping of a grid axis to plot axis
    # visual_reverse: mapping of a plot axis to grid axis
    move = {}
    for grid_name, visual_name in visual.items():
        if visual_axes[visual_name] in visual.values():
            index = list(visual.values()).index(visual_name)
            key = list(visual.keys())[index]
            raise ValueError("trying to map %s to %s, while it is already mapped by %s" % (grid_name, visual_name, key))
        move[grid_axes[grid_name]] = visual_axes[visual_name]

    # normalize_axis = _ensure_list(normalize_axis)

    fs = _expand(f, total_grid.shape[grid_axes[normalize_axis]])
    # assert len(vwhat)
    # labels["y"] = ylabels
    what_labels = []
    if grid is None:
        grid_of_grids = []
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            grid_of_grids.append([])
            for j, what in enumerate(whats):
                if isinstance(what, vaex.stat.Expression):
                    grid = what.calculate(self, binby=binby, shape=shape, limits=limits, selection=selections, delay=True)
                else:
                    what = what.strip()
                    index = what.index("(")
                    import re
                    groups = re.match(r"(.*)\((.*)\)", what).groups()
                    if groups and len(groups) == 2:
                        function = groups[0]
                        arguments = groups[1].strip()
                        if "," in arguments:
                            arguments = arguments.split(",")
                        functions = ["mean", "sum", "std", "var", "correlation", "covar", "min", "max", "median_approx"]
                        unit_expression = None
                        if function in ["mean", "sum", "std", "min", "max", "median"]:
                            unit_expression = arguments
                        if function in ["var"]:
                            unit_expression = "(%s) * (%s)" % (arguments, arguments)
                        if function in ["covar"]:
                            unit_expression = "(%s) * (%s)" % arguments
                        if unit_expression:
                            unit = self.unit(unit_expression)
                            if unit:
                                what_units = unit.to_string('latex_inline')
                        if function in functions:
                            grid = getattr(self, function)(arguments, binby=binby, limits=limits, shape=shape, selection=selections, delay=True)
                        elif function == "count":
                            grid = self.count(arguments, binby, shape=shape, limits=limits, selection=selections, delay=True)
                        else:
                            raise ValueError("Could not understand method: %s, expected one of %r'" % (function, functions))
                    else:
                        raise ValueError("Could not understand 'what' argument %r, expected something in form: 'count(*)', 'mean(x)'" % what)
                if i == 0:  # and j == 0:
                    what_label = str(whats[j])
                    if what_units:
                        what_label += " (%s)" % what_units
                    if fs[j]:
                        what_label = fs[j] + " " + what_label
                    what_labels.append(what_label)
                grid_of_grids[-1].append(grid)
        self.executor.execute()
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            for j, what in enumerate(whats):
                grid = grid_of_grids[i][j].get()
                total_grid[i, j, :, :] = grid[:, None, ...]
        labels["what"] = what_labels
    else:
        dims_left = 6 - len(grid.shape)
        total_grid = np.broadcast_to(grid, (1,) * dims_left + grid.shape)

    #           visual=dict(x="x", y="y", selection="fade", subspace="facet1", what="facet2",)
    def _selection_name(name):
        if name in [None, False]:
            return "selection: all"
        elif name in ["default", True]:
            return "selection: default"
        else:
            return "selection: %s" % name
    if selection_labels is None:
        labels["selection"] = list([_selection_name(k) for k in selections])
    else:
        labels["selection"] = selection_labels

    # visual_grid = np.moveaxis(total_grid, move.keys(), move.values())
    # np.moveaxis is in np 1.11 only?, use transpose
    axes = [None] * len(move)
    for key, value in move.items():
        axes[value] = key
    visual_grid = np.transpose(total_grid, axes)

    logger.debug("grid shape: %r", total_grid.shape)
    logger.debug("visual: %r", visual.items())
    logger.debug("move: %r", move)
    logger.debug("visual grid shape: %r", visual_grid.shape)

    xexpressions = []
    yexpressions = []
    for i, (binby, limits) in enumerate(zip(x, xlimits)):
        xexpressions.append(binby[0])
        yexpressions.append(binby[1])
    if xlabel is None:
        xlabels = []
        ylabels = []
        for i, (binby, limits) in enumerate(zip(x, xlimits)):
            if z is not None:
                xlabels.append(self.label(binby[1]))
                ylabels.append(self.label(binby[2]))
            else:
                xlabels.append(self.label(binby[0]))
                ylabels.append(self.label(binby[1]))
    else:
        Nl = visual_grid.shape[visual_axes['row']]
        xlabels = _expand(xlabel, Nl)
        ylabels = _expand(ylabel, Nl)
    #labels[visual["x"]] = (xlabels, ylabels)
    labels["x"] = xlabels
    labels["y"] = ylabels

    # grid = total_grid
    # print(grid.shape)
    # grid = self.reduce(grid, )
    axes = []
    # cax = pylab.subplot(1,1,1)

    background_color = np.array(matplotlib.colors.colorConverter.to_rgb(background_color))

    # if grid.shape[axis["selection"]] > 1:#  and not facet:
    #   rgrid = vaex.image.fade(rgrid)
    #   finite_mask = np.any(finite_mask, axis=0) # do we really need this
    #   print(rgrid.shape)
    # facet_row_axis = axis["what"]
    import math
    facet_columns = None
    facets = visual_grid.shape[visual_axes["row"]] * visual_grid.shape[visual_axes["column"]]
    if visual_grid.shape[visual_axes["column"]] == 1 and wrap:
        facet_columns = min(wrap_columns, visual_grid.shape[visual_axes["row"]])
        wrapped = True
    elif visual_grid.shape[visual_axes["row"]] == 1 and wrap:
        facet_columns = min(wrap_columns, visual_grid.shape[visual_axes["column"]])
        wrapped = True
    else:
        wrapped = False
        facet_columns = visual_grid.shape[visual_axes["column"]]
    facet_rows = int(math.ceil(facets / facet_columns))
    logger.debug("facet_rows: %r", facet_rows)
    logger.debug("facet_columns: %r", facet_columns)
    # if visual_grid.shape[visual_axes["row"]] > 1: # and not wrap:
    #   #facet_row_axis = axis["what"]
    #   facet_columns = visual_grid.shape[visual_axes["column"]]
    # else:
    #   facet_columns = min(wrap_columns, facets)
    # if grid.shape[axis["plot"]] > 1:#  and not facet:

    # this loop could be done using axis arguments everywhere
    # assert len(normalize_axis) == 1, "currently only 1 normalization axis supported"
    grid = visual_grid * 1.
    fgrid = visual_grid * 1.
    ngrid = visual_grid * 1.
    # colorgrid = np.zeros(ngrid.shape + (4,), float)
    # print "norma", normalize_axis, visual_grid.shape[visual_axes[visual[normalize_axis]]]
    vmins = _expand(vmin, visual_grid.shape[visual_axes[visual[normalize_axis]]], type=list)
    vmaxs = _expand(vmax, visual_grid.shape[visual_axes[visual[normalize_axis]]], type=list)
    # for name in normalize_axis:
    if smooth_pre:
        grid = vaex.grids.gf(grid, smooth_pre)
    if 1:
        axis = visual_axes[visual[normalize_axis]]
        for i in range(visual_grid.shape[axis]):
            item = [slice(None, None, None), ] * len(visual_grid.shape)
            item[axis] = i
            item = tuple(item)
            f = _parse_f(fs[i])
            with np.errstate(divide='ignore', invalid='ignore'):  # these are fine, we are ok with nan's in vaex
                fgrid.__setitem__(item, f(grid.__getitem__(item)))
            # print vmins[i], vmaxs[i]
            if vmins[i] is not None and vmaxs[i] is not None:
                nsubgrid = fgrid.__getitem__(item) * 1
                nsubgrid -= vmins[i]
                nsubgrid /= (vmaxs[i] - vmins[i])
            else:
                nsubgrid, vmin, vmax = n(fgrid.__getitem__(item))
                vmins[i] = vmin
                vmaxs[i] = vmax
            # print "    ", vmins[i], vmaxs[i]
            ngrid.__setitem__(item, nsubgrid)

    if 0:  # TODO: above should be like the code below, with custom vmin and vmax
        grid = visual_grid[i]
        f = _parse_f(fs[i])
        fgrid = f(grid)
        finite_mask = np.isfinite(grid)
        finite_mask = np.any(finite_mask, axis=0)
        if vmin is not None and vmax is not None:
            ngrid = fgrid * 1
            ngrid -= vmin
            ngrid /= (vmax - vmin)
            ngrid = np.clip(ngrid, 0, 1)
        else:
            ngrid, vmin, vmax = n(fgrid)
            # vmin, vmax = np.nanmin(fgrid), np.nanmax(fgrid)
    # every 'what', should have its own colorbar, check if what corresponds to
    # rows or columns in facets, if so, do a colorbar per row or per column

    rows, columns = int(math.ceil(facets / float(facet_columns))), facet_columns
    colorbar_location = "individual"
    if visual["what"] == "row" and visual_grid.shape[1] == facet_columns:
        colorbar_location = "per_row"
    if visual["what"] == "column" and visual_grid.shape[0] == facet_rows:
        colorbar_location = "per_column"
    # values = np.linspace(facet_limits[0], facet_limits[1], facet_count+1)
    logger.debug("rows: %r, columns: %r", rows, columns)
    import matplotlib.gridspec as gridspec
    column_scale = 1
    row_scale = 1
    row_offset = 0
    if facets > 1:
        if colorbar_location == "per_row":
            column_scale = 4
            gs = gridspec.GridSpec(rows, columns * column_scale + 1)
        elif colorbar_location == "per_column":
            row_offset = 1
            row_scale = 4
            gs = gridspec.GridSpec(rows * row_scale + 1, columns)
        else:
            gs = gridspec.GridSpec(rows, columns)
    facet_index = 0
    fs = _expand(f, len(whats))
    colormaps = _expand(colormap, len(whats))

    # row
    for i in range(visual_grid.shape[0]):
        # column
        for j in range(visual_grid.shape[1]):
            if colorbar and colorbar_location == "per_column" and i == 0:
                norm = matplotlib.colors.Normalize(vmins[j], vmaxs[j])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[j])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[0, j])
                    colorbar = fig.colorbar(sm, cax=ax, orientation="horizontal")
                else:
                    colorbar = fig.colorbar(sm)
                if "what" in labels:
                    label = labels["what"][j]
                    if facets > 1:
                        colorbar.ax.set_title(label)
                    else:
                        colorbar.ax.set_ylabel(colorbar_label or label)

            if colorbar and colorbar_location == "per_row" and j == 0:
                norm = matplotlib.colors.Normalize(vmins[i], vmaxs[i])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[i])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[i, -1])
                    colorbar = fig.colorbar(sm, cax=ax)
                else:
                    colorbar = fig.colorbar(sm)
                label = labels["what"][i]
                colorbar.ax.set_ylabel(colorbar_label or label)

            rgrid = ngrid[i, j] * 1.
            # print rgrid.shape
            if smooth_post is not None:
                for k in range(rgrid.shape[0]):
                    for l in range(rgrid.shape[1]):
                        rgrid[k, l] = vaex.grids.gf(rgrid[k, l], smooth_post)
            if visual["what"] == "column":
                what_index = j
            elif visual["what"] == "row":
                what_index = i
            else:
                what_index = 0

            if visual[normalize_axis] == "column":
                normalize_index = j
            elif visual[normalize_axis] == "row":
                normalize_index = i
            else:
                normalize_index = 0
            for r in reduce:
                r = _parse_reduction(r, colormaps[what_index], [])
                rgrid = r(rgrid)

            row = facet_index // facet_columns
            column = facet_index % facet_columns

            if colorbar and colorbar_location == "individual":
                # visual_grid.shape[visual_axes[visual[normalize_axis]]]
                norm = matplotlib.colors.Normalize(vmins[normalize_index], vmaxs[normalize_index])
                sm = matplotlib.cm.ScalarMappable(norm, colormaps[what_index])
                sm.set_array(1)  # make matplotlib happy (strange behaviour)
                if facets > 1:
                    ax = pylab.subplot(gs[row, column])
                    colorbar = fig.colorbar(sm, ax=ax)
                else:
                    colorbar = fig.colorbar(sm)
                label = labels["what"][what_index]
                colorbar.ax.set_ylabel(colorbar_label or label)

            if facets > 1:
                ax = pylab.subplot(gs[row_offset + row * row_scale:row_offset + (row + 1) * row_scale, column * column_scale:(column + 1) * column_scale])
            else:
                ax = pylab.gca()
            axes.append(ax)
            logger.debug("rgrid: %r", rgrid.shape)
            plot_rgrid = rgrid
            assert plot_rgrid.shape[1] == 1, "no layers supported yet"
            plot_rgrid = plot_rgrid[:, 0]
            if plot_rgrid.shape[0] > 1:
                plot_rgrid = vaex.image.fade(plot_rgrid[::-1])
            else:
                plot_rgrid = plot_rgrid[0]
            extend = None
            if visual["subspace"] == "row":
                subplot_index = i
            elif visual["subspace"] == "column":
                subplot_index = j
            else:
                subplot_index = 0
            extend = np.array(xlimits[subplot_index][-2:]).flatten()
            #   extend = np.array(xlimits[i]).flatten()
            logger.debug("plot rgrid: %r", plot_rgrid.shape)
            plot_rgrid = np.transpose(plot_rgrid, (1, 0, 2))
            im = ax.imshow(plot_rgrid, extent=extend.tolist(), origin="lower", aspect=aspect, interpolation=interpolation)
            # v1, v2 = values[i], values[i+1]

            def label(index, label, expression):
                if label and _issequence(label):
                    return label[index]
                else:
                    return self.label(expression)
            if visual_reverse["x"] =='x':
                labelsx = labels['x']
                pylab.xlabel(labelsx[subplot_index])
            if visual_reverse["x"] =='x':
                labelsy = labels['y']
                pylab.ylabel(labelsy[subplot_index])
            if visual["z"] in ['row']:
                labelsz = labels['z']
                ax.set_title(labelsz[i])
            if visual["z"] in ['column']:
                labelsz = labels['z']
                ax.set_title(labelsz[j])

            max_labels = 10
            # xexpression = xexpressions[i]
            # if self.iscategory(xexpression):
            #     labels = self.category_labels(xexpression)
            #     step = len(labels) // max_labels
            #     pylab.xticks(np.arange(len(labels))[::step], labels[::step], size='small')
            # yexpression = yexpressions[i]
            # if self.iscategory(yexpression):
            #     labels = self.category_labels(yexpression)
            #     step = len(labels) // max_labels
            #     pylab.yticks(np.arange(len(labels))[::step], labels[::step], size='small')
            facet_index += 1
    if title:
        fig.suptitle(title, fontsize="x-large")
    if tight_layout:
        if title:
            pylab.tight_layout(rect=[0, 0.03, 1, 0.95])
        else:
            pylab.tight_layout()
    if hardcopy:
        pylab.savefig(hardcopy)
    if show:
        pylab.show()
    if return_extra:
        return im, grid, fgrid, ngrid, rgrid
    else:
        return im
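# A minimal standalone sketch of the grid-to-visual mapping applied with np.transpose in plot above,
# using a made-up 6-d grid and mapping 'what' to rows and 'subspace' to columns (the opposite of the
# defaults) so the axis reordering is visible.
import numpy as np

total_grid = np.zeros((2, 3, 1, 1, 16, 16))      # made-up grid: (subspace, what, selection, z, y, x)
grid_axes = dict(x=-1, y=-2, z=-3, selection=-4, what=-5, subspace=-6)
visual_axes = dict(x=-1, y=-2, layer=-3, fade=-4, column=-5, row=-6)
move = {grid_axes['x']: visual_axes['x'],
        grid_axes['y']: visual_axes['y'],
        grid_axes['z']: visual_axes['layer'],
        grid_axes['selection']: visual_axes['fade'],
        grid_axes['what']: visual_axes['row'],       # 'what' shown as rows instead of columns
        grid_axes['subspace']: visual_axes['column']}
axes = [None] * len(move)
for source, destination in move.items():
    axes[destination] = source
visual_grid = np.transpose(total_grid, axes)
print(visual_grid.shape)                         # (3, 2, 1, 1, 16, 16): 'what' is now the row axis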