def test_process_dataframe_output_default(smalldf):
    """Check process_dataframe defaults: array shape/dtype, per-column dtypes,
    distargs, and converter maps for a 2-continuous / 2-categorical frame."""
    data_array, dtypes, distargs, converters = du.process_dataframe(smalldf, 2)

    # Output array mirrors the input frame and is floating point.
    assert data_array.shape == smalldf.shape
    assert 'float' in str(data_array.dtype)

    # Column type inference: cols 0 and 3 continuous, cols 1 and 2 categorical.
    expected_dtypes = ['continuous', 'categorical', 'categorical', 'continuous']
    for col_idx, expected_dtype in enumerate(expected_dtypes):
        assert dtypes[col_idx] == expected_dtype

    # distargs for continuous is irrelevant (0); distargs for categorical
    # should be the number of distinct values in that column.
    expected_distargs = [0, 2, 3, 0]
    for col_idx, expected_arg in enumerate(expected_distargs):
        assert len(distargs[col_idx]) == 1
        assert distargs[col_idx][0] == expected_arg

    # Converters: one index entry per column, one value map per categorical col.
    n_cols = smalldf.shape[1]
    assert len(converters['col2idx']) == n_cols
    assert len(converters['idx2col']) == n_cols
    assert len(converters['valmaps']) == 2  # number of categorical cols
def __init__(self, df=None, metadata=None, n_models=8, **kwargs):
    """Initialize the engine from a DataFrame (or CSV path) and metadata.

    Parameters
    ----------
    df : file name, pandas.DataFrame
        The data for inference.
    metadata : dict
        Column metadata to speed processing. Providing more data in
        `metadata` speeds up the processing of `df` by obviating the
        need to infer data types and create value maps.
    n_models : int
        The number of models (states/samples) over which to average.
        Each model represents a sample from an independent Markov Chain.
    seed : integer
        Positive integer seed for the random number generators.
    mapper : callable
        A map function that returns a list. For example: `mp.Pool.map`
        or `lambda f, args: list(map(f, args))`.
    use_mp : bool, optional
        If True (default), model-parallel tasks are run in parallel.
    index_col : int or None
        If `df` is a file name, index_col is the integer index of the
        index column. Assumes the first column (0) by default.

    Examples
    --------
    Initialize with partial metadata

    >>> import pandas as pd
    >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
    >>> metadata = {
    ...     'stripes': {
    ...         'dtype': 'categorical',
    ...         'values': [0, 1]
    ...     }
    ... }
    >>> engine = Engine(df, metadata)

    Initialize with IPython Parallel mapper

    >>> import pandas as pd
    >>> import ipyparallel as ipp
    >>> df = pd.read_csv('examples/animals/animals.csv', index_col=0)
    >>> c = ipp.Client()
    >>> engine = Engine(df, mapper=c.map)
    """
    # Data is mandatory even though the parameter has a default.
    if df is None:
        raise ValueError('Give me some data (;-_-)')

    # A string `df` is treated as a CSV path; `index_col` defaults to 0.
    if isinstance(df, str):
        df = pd.read_csv(df, index_col=kwargs.get('index_col', 0))

    # Kept so the engine can be re-created with the same arguments later.
    self._init_args = {'df': df, 'metadata': metadata, 'kwargs': kwargs}

    guess_n_unique_cutoff = kwargs.get('guess_n_unique_cutoff', 20)
    use_mp = kwargs.get('use_mp', True)
    mapper = kwargs.get('mapper', None)

    # process_dataframe infers dtypes/distargs and builds value-map
    # converters (see the related tests elsewhere in this project).
    output = du.process_dataframe(df, n_models, metadata, guess_n_unique_cutoff)
    self._data, self._dtypes, self._distargs, self._converters = output

    self._df = df
    self._n_rows, self._n_cols = self._df.shape
    self._metadata = metadata

    self._row_names = df.index
    self._col_names = df.columns

    self._seed = kwargs.get('seed', None)

    # NOTE: seeding mutates the *global* numpy and stdlib RNG state, not
    # engine-local generators.
    if self._seed is not None:
        np.random.seed(self._seed)
        random.seed(self._seed)

    # Choose the map function: an explicit `mapper` wins; otherwise a
    # process pool when `use_mp`, else a serial map. `_pool` stays None
    # unless we created the pool ourselves.
    # NOTE(review): the pool is presumably multiprocessing.Pool and does
    # not appear to be closed here — confirm cleanup happens elsewhere.
    self._pool = None
    if mapper is None:
        if use_mp:
            self._pool = Pool()
            self._mapper = self._pool.map
        else:
            self._mapper = lambda func, args: list(map(func, args))
    else:
        self._mapper = mapper

    # Models are built lazily; see the init/fit machinery elsewhere.
    self._initialized = False
    self._models = []
    self._n_models = n_models
    self._diagnostic_tables = []