# Example #1
# 0
def test_process_dataframe_output_default(smalldf):
    """process_dataframe should return array, dtypes, distargs, converters."""
    data_array, dtypes, distargs, converters = du.process_dataframe(smalldf, 2)

    # The numeric output mirrors the input's shape and is floating point.
    assert data_array.shape == smalldf.shape
    assert 'float' in str(data_array.dtype)

    expected_dtypes = ['continuous', 'categorical', 'categorical',
                       'continuous']
    for col, expected in enumerate(expected_dtypes):
        assert dtypes[col] == expected

    # distargs for continuous is irrelevant (placeholder 0); distargs for
    # categorical should be the number of values.
    expected_distargs = [0, 2, 3, 0]
    for col, expected in enumerate(expected_distargs):
        assert len(distargs[col]) == 1
        assert distargs[col][0] == expected

    # Converters map columns both ways and carry one value map per
    # categorical column.
    n_cols = smalldf.shape[1]
    assert len(converters['col2idx']) == n_cols
    assert len(converters['idx2col']) == n_cols
    assert len(converters['valmaps']) == 2  # number of categorical cols
def test_process_dataframe_output_default(smalldf):
    """Check the default output contract of du.process_dataframe."""
    output = du.process_dataframe(smalldf, 2)
    data_array, dtypes, distargs, converters = output

    # Shape is preserved and the array holds floats.
    assert data_array.shape == smalldf.shape
    assert 'float' in str(data_array.dtype)

    dtype_table = ['continuous', 'categorical', 'categorical', 'continuous']
    for i in range(4):
        assert dtypes[i] == dtype_table[i]

    # distargs for continuous is irrelevant, distargs for categorical should
    # be the number of values.
    for i, n_vals in [(0, 0), (1, 2), (2, 3), (3, 0)]:
        assert len(distargs[i]) == 1
        assert distargs[i][0] == n_vals

    # converters: both index maps span every column; one value map per
    # categorical column.
    for key in ('col2idx', 'idx2col'):
        assert len(converters[key]) == smalldf.shape[1]
    assert len(converters['valmaps']) == 2  # number of categorical cols
# Example #3
# 0
    def __init__(self, df=None, metadata=None, n_models=8, **kwargs):
        """Build an engine from a dataframe (or CSV path) plus options.

        Parameters
        ----------
        df : file name, pandas.DataFrame
            The data for inference.
        metadata : dict
            Column metadata to speed processing. Providing more data in
            `metadata` speeds up the processing of `df` by obviating the need
            to infer data types and create value maps.
        n_models : int
            The number of models (states/samples) over which to average. Each
            model represents a sample from an independent Markov Chain.
        seed : integer
            Positive integer seed for the random number generators.
        mapper : callable
            A map function that returns a list. For example: `mp.Pool.map` or
            `lambda f, args: list(map(f, args))`.
        use_mp : bool, optional
            If True (default), model-parallel tasks are run in parallel.
        index_col : int or None
            If `df` is a file name, index col is the integer index of the
            index column. Assumes the first column (0) by default.

        Examples
        --------
        Initialize with partial metadata

        >>> import pandas as pd
        >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
        >>> metadata = {
        ...     'stripes': {
        ...         'dtype': 'categorical',
        ...         'values': [0, 1]
        ...     }
        ... }
        >>> engine = Engine(df, metadata)

        Initialize with IPython Parallel mapper

        >>> import pandas as pd
        >>> import ipyparallel as ipp
        >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
        >>> c = ipp.Client()
        >>> engine = Engine(df, mapper=c.map)
        """
        if df is None:
            raise ValueError('Give me some data (;-_-)')

        # A string `df` is treated as a CSV path.
        if isinstance(df, str):
            df = pd.read_csv(df, index_col=kwargs.get('index_col', 0))

        # Stash the constructor arguments for later reconstruction.
        self._init_args = {'df': df, 'metadata': metadata, 'kwargs': kwargs}

        cutoff = kwargs.get('guess_n_unique_cutoff', 20)
        run_parallel = kwargs.get('use_mp', True)
        user_mapper = kwargs.get('mapper', None)

        # Convert the dataframe into the numeric array plus dtype/distarg
        # descriptions and the column/value converters the models consume.
        (self._data, self._dtypes, self._distargs,
         self._converters) = du.process_dataframe(df, n_models, metadata,
                                                  cutoff)

        self._df = df
        self._n_rows, self._n_cols = self._df.shape
        self._metadata = metadata

        self._row_names = df.index
        self._col_names = df.columns

        # Seed both numpy's and the stdlib's generators for reproducibility.
        self._seed = kwargs.get('seed', None)
        if self._seed is not None:
            np.random.seed(self._seed)
            random.seed(self._seed)

        # Choose a map implementation: user-supplied mapper takes priority,
        # then a multiprocessing pool, then a serial fallback.
        self._pool = None
        if user_mapper is not None:
            self._mapper = user_mapper
        elif run_parallel:
            self._pool = Pool()
            self._mapper = self._pool.map
        else:
            self._mapper = lambda func, args: list(map(func, args))

        self._initialized = False
        self._models = []
        self._n_models = n_models
        self._diagnostic_tables = []
# Example #4
# 0
    def __init__(self, df=None, metadata=None, n_models=8, **kwargs):
        """Initialize the engine from data, optional metadata, and options.

        Parameters
        ----------
        df : file name, pandas.DataFrame
            The data for inference.
        metadata : dict
            Column metadata to speed processing. Providing more data in
            `metadata` speeds up the processing of `df` by obviating the need
            to infer data types and create value maps.
        n_models : int
            The number of models (states/samples) over which to average. Each
            model represents a sample from an independent Markov Chain.
        seed : integer
            Positive integer seed for the random number generators.
        mapper : callable
            A map function that returns a list. For example: `mp.Pool.map` or
            `lambda f, args: list(map(f, args))`.
        use_mp : bool, optional
            If True (default), model-parallel tasks are run in parallel.
        index_col : int or None
            If `df` is a file name, index col is the integer index of the
            index column. Assumes the first column (0) by default.

        Examples
        --------
        Initialize with partial metadata

        >>> import pandas as pd
        >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
        >>> metadata = {
        ...     'stripes': {
        ...         'dtype': 'categorical',
        ...         'values': [0, 1]
        ...     }
        ... }
        >>> engine = Engine(df, metadata)

        Initialize with IPython Parallel mapper

        >>> import pandas as pd
        >>> import ipyparallel as ipp
        >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
        >>> c = ipp.Client()
        >>> engine = Engine(df, mapper=c.map)
        """
        if df is None:
            raise ValueError('Give me some data (;-_-)')

        # Accept a CSV path in place of a dataframe.
        if isinstance(df, str):
            df = pd.read_csv(df, index_col=kwargs.get('index_col', 0))

        # Remember how we were constructed.
        self._init_args = {'df': df, 'metadata': metadata, 'kwargs': kwargs}

        n_unique_cutoff = kwargs.get('guess_n_unique_cutoff', 20)

        # Preprocess the dataframe into model-ready pieces.
        processed = du.process_dataframe(df, n_models, metadata,
                                         n_unique_cutoff)
        self._data = processed[0]
        self._dtypes = processed[1]
        self._distargs = processed[2]
        self._converters = processed[3]

        self._df = df
        self._metadata = metadata
        self._n_rows, self._n_cols = self._df.shape
        self._row_names = df.index
        self._col_names = df.columns

        # When a seed is given, seed numpy and the stdlib RNG alike.
        seed = kwargs.get('seed', None)
        self._seed = seed
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

        # Resolve the mapper: explicit kwarg wins; otherwise use a process
        # pool when multiprocessing is enabled, else run serially.
        mapper = kwargs.get('mapper', None)
        self._pool = None
        if mapper is None:
            if kwargs.get('use_mp', True):
                self._pool = Pool()
                self._mapper = self._pool.map
            else:
                self._mapper = lambda f, args: list(map(f, args))
        else:
            self._mapper = mapper

        self._initialized = False
        self._models = []
        self._n_models = n_models
        self._diagnostic_tables = []