def data_frames(columns=None, rows=None, index=None):
    """Provides a strategy for producing a :class:`pandas.DataFrame`.

    Arguments:

    * columns: An iterable of :class:`column` objects describing the shape
      of the generated DataFrame.

    * rows: A strategy for generating a row object. Should generate
      either dicts mapping column names to values or a sequence mapping
      column position to the value in that position (note that unlike the
      :class:`pandas.DataFrame` constructor, single values are not allowed
      here. Passing e.g. an integer is an error, even if there is only one
      column).

      At least one of rows and columns must be provided. If both are
      provided then the generated rows will be validated against the
      columns and an error will be raised if they don't match.

      Caveats on using rows:

      * In general you should prefer using columns to rows, and only use
        rows if the columns interface is insufficiently flexible to
        describe what you need - you will get better performance and
        example quality that way.
      * If you provide rows and not columns, then the shape and dtype of
        the resulting DataFrame may vary. e.g. if you have a mix of int
        and float in the values for one column in your row entries, the
        column will sometimes have an integral dtype and sometimes a float.

    * index: If not None, a strategy for generating indexes for the
      resulting DataFrame. This can generate either :class:`pandas.Index`
      objects or any sequence of values (which will be passed to the
      Index constructor).

      You will probably find it most convenient to use the
      :func:`~hypothesis.extra.pandas.indexes` or
      :func:`~hypothesis.extra.pandas.range_indexes` function to produce
      values for this argument.

    Usage:

    The expected usage pattern is that you use :class:`column` and
    :func:`columns` to specify a fixed shape of the DataFrame you want as
    follows. For example the following gives a two column data frame:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> data_frames([
        ... column('A', dtype=int), column('B', dtype=float)]).example()
                    A              B
        0  2021915903  1.793898e+232
        1  1146643993            inf
        2 -2096165693   1.000000e+07

    If you want the values in different columns to interact in some way you
    can use the rows argument. For example the following gives a two column
    DataFrame where the value in the first column is always at most the value
    in the second:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                       0             1
        0  -3.402823e+38  9.007199e+15
        1 -1.562796e-298  5.000000e-01

    You can also combine the two:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import columns, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     columns=columns(["lo", "hi"], dtype=float),
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                      lo            hi
        0   9.314723e-49  4.353037e+45
        1  -9.999900e-01  1.000000e+07
        2 -2.152861e+134 -1.069317e-73

    (Note that the column dtype must still be specified and will not be
    inferred from the rows. This restriction may be lifted in future).

    Combining rows and columns has the following behaviour:

    * The column names and dtypes will be used.
    * If the column is required to be unique, this will be enforced.
    * Any values missing from the generated rows will be provided using the
      column's fill.
    * Any values in the row not present in the column specification (if
      dicts are passed, if there are keys with no corresponding column name,
      if sequences are passed if there are too many items) will result in
      InvalidArgument being raised.

    """
    if index is None:
        index = range_indexes()
    else:
        st.check_strategy(index)

    # The nested composites rebind ``index`` to the drawn value, so keep the
    # strategy itself under a separate name.
    index_strategy = index

    if columns is None:
        if rows is None:
            raise InvalidArgument(
                'At least one of rows and columns must be provided')
        else:
            @st.composite
            def rows_only(draw):
                index = draw(index_strategy)

                @check_function
                def row():
                    result = draw(rows)
                    st.check_type(Iterable, result, 'draw(row)')
                    return result

                if len(index) > 0:
                    return pandas.DataFrame(
                        [row() for _ in index], index=index)
                else:
                    # If we haven't drawn any rows we need to draw one row and
                    # then discard it so that we get a consistent shape for the
                    # DataFrame.
                    base = pandas.DataFrame([row()])
                    return base.drop(0)
            return rows_only()

    assert columns is not None
    columns = st.try_convert(tuple, columns, 'columns')

    # Validated copies of the caller's column specs; the originals are never
    # mutated because each one is copied before being rewritten.
    rewritten_columns = []
    column_names = set()

    for i, c in enumerate(columns):
        st.check_type(column, c, 'columns[%d]' % (i, ))

        c = copy(c)
        if c.name is None:
            # Unnamed columns are named after their position.
            label = 'columns[%d]' % (i, )
            c.name = i
        else:
            label = c.name
            try:
                hash(c.name)
            except TypeError:
                raise InvalidArgument(
                    'Column names must be hashable, but columns[%d].name was '
                    '%r of type %s, which cannot be hashed.' % (
                        i, c.name, type(c.name).__name__,))

        if c.name in column_names:
            raise InvalidArgument(
                'duplicate definition of column name %r' % (c.name, ))

        column_names.add(c.name)

        c.elements, c.dtype = elements_and_dtype(c.elements, c.dtype, label)

        if c.dtype is None and rows is not None:
            raise InvalidArgument(
                'Must specify a dtype for all columns when combining rows with'
                ' columns.')

        c.fill = npst.fill_for(
            fill=c.fill, elements=c.elements, unique=c.unique, name=label)

        rewritten_columns.append(c)

    if rows is None:
        @st.composite
        def just_draw_columns(draw):
            index = draw(index_strategy)
            local_index_strategy = st.just(index)

            # Pre-populate the keys so the column order matches the spec
            # regardless of the order in which the columns are drawn below.
            data = OrderedDict((c.name, None) for c in rewritten_columns)

            # Depending on how the columns are going to be generated we group
            # them differently to get better shrinking. For columns with fill
            # enabled, the elements can be shrunk independently of the size,
            # so we can just shrink by shrinking the index then shrinking the
            # length and are generally much more free to move data around.

            # For columns with no filling the problem is harder, and drawing
            # them like that would result in rows being very far apart from
            # each other in the underlying data stream, which gets in the way
            # of shrinking. So what we do is reorder and draw those columns
            # row wise, so that the values of each row are next to each other.
            # This makes life easier for the shrinker when deleting blocks of
            # data.
            columns_without_fill = [
                c for c in rewritten_columns if c.fill.is_empty]

            if columns_without_fill:
                for c in columns_without_fill:
                    data[c.name] = pandas.Series(
                        np.zeros(shape=len(index), dtype=c.dtype),
                        index=index,
                    )
                seen = {
                    c.name: set() for c in columns_without_fill if c.unique}

                for i in hrange(len(index)):
                    for c in columns_without_fill:
                        if c.unique:
                            # Bounded number of retries to find a fresh value
                            # before giving up on this example.
                            for _ in hrange(5):
                                value = draw(c.elements)
                                if value not in seen[c.name]:
                                    seen[c.name].add(value)
                                    break
                            else:
                                reject()
                        else:
                            value = draw(c.elements)
                        # ``i`` is a position, not a label. Plain ``[]``
                        # assignment is label-based on an integer index and
                        # would enlarge the Series instead of filling row i
                        # whenever the drawn index is not 0..n-1.
                        data[c.name].iloc[i] = value

            for c in rewritten_columns:
                if not c.fill.is_empty:
                    data[c.name] = draw(series(
                        index=local_index_strategy, dtype=c.dtype,
                        elements=c.elements, fill=c.fill, unique=c.unique))

            return pandas.DataFrame(data, index=index)
        return just_draw_columns()
    else:
        @st.composite
        def assign_rows(draw):
            index = draw(index_strategy)

            # Start from a correctly-typed frame of zeros and overwrite it
            # one row at a time.
            result = pandas.DataFrame(OrderedDict(
                (c.name, pandas.Series(
                    np.zeros(dtype=c.dtype, shape=len(index)),
                    dtype=c.dtype))
                for c in rewritten_columns
            ), index=index)

            # Fill values drawn for columns missing from dict rows, cached by
            # column position so each column's fill is drawn at most once.
            fills = {}

            any_unique = any(c.unique for c in rewritten_columns)

            if any_unique:
                # One set per unique column (None for the others), trimmed
                # after the last unique column so zip() stops there.
                all_seen = [
                    set() if c.unique else None for c in rewritten_columns]
                while all_seen[-1] is None:
                    all_seen.pop()

            for row_index in hrange(len(index)):
                # Bounded number of attempts to draw an acceptable row.
                for _ in hrange(5):
                    original_row = draw(rows)
                    row = original_row
                    if isinstance(row, dict):
                        as_list = [None] * len(rewritten_columns)
                        for i, c in enumerate(rewritten_columns):
                            try:
                                as_list[i] = row[c.name]
                            except KeyError:
                                try:
                                    as_list[i] = fills[i]
                                except KeyError:
                                    fills[i] = draw(c.fill)
                                    as_list[i] = fills[i]
                        for k in row:
                            if k not in column_names:
                                raise InvalidArgument((
                                    'Row %r contains column %r not in '
                                    'columns %r)' % (
                                        row, k, [
                                            c.name for c in rewritten_columns
                                        ])))
                        row = as_list
                    if any_unique:
                        has_duplicate = False
                        # Only record values once the whole row is known to
                        # be acceptable; otherwise a row rejected because of
                        # a later column would leave its earlier values
                        # marked as seen although they never reached the
                        # result, causing spurious rejections afterwards.
                        pending = []
                        for seen, value in zip(all_seen, row):
                            if seen is None:
                                continue
                            if value in seen:
                                has_duplicate = True
                                break
                            pending.append((seen, value))
                        if has_duplicate:
                            continue
                        for seen, value in pending:
                            seen.add(value)
                    row = list(st.try_convert(tuple, row, 'draw(rows)'))

                    if len(row) > len(rewritten_columns):
                        raise InvalidArgument((
                            'Row %r contains too many entries. Has %d but '
                            'expected at most %d') % (
                                original_row, len(row), len(rewritten_columns)
                        ))
                    # Short sequences are padded using the fill of each
                    # missing trailing column.
                    while len(row) < len(rewritten_columns):
                        row.append(draw(rewritten_columns[len(row)].fill))
                    result.iloc[row_index] = row
                    break
                else:
                    reject()
            return result
        return assign_rows()
def data_frames(columns=None, rows=None, index=None):
    """Provides a strategy for producing a :class:`pandas.DataFrame`.

    Arguments:

    * columns: An iterable of :class:`column` objects describing the shape
      of the generated DataFrame.

    * rows: A strategy for generating a row object. Should generate
      either dicts mapping column names to values or a sequence mapping
      column position to the value in that position (note that unlike the
      :class:`pandas.DataFrame` constructor, single values are not allowed
      here. Passing e.g. an integer is an error, even if there is only one
      column).

      At least one of rows and columns must be provided. If both are
      provided then the generated rows will be validated against the
      columns and an error will be raised if they don't match.

      Caveats on using rows:

      * In general you should prefer using columns to rows, and only use
        rows if the columns interface is insufficiently flexible to
        describe what you need - you will get better performance and
        example quality that way.
      * If you provide rows and not columns, then the shape and dtype of
        the resulting DataFrame may vary. e.g. if you have a mix of int
        and float in the values for one column in your row entries, the
        column will sometimes have an integral dtype and sometimes a float.

    * index: If not None, a strategy for generating indexes for the
      resulting DataFrame. This can generate either :class:`pandas.Index`
      objects or any sequence of values (which will be passed to the
      Index constructor).

      You will probably find it most convenient to use the
      :func:`~hypothesis.extra.pandas.indexes` or
      :func:`~hypothesis.extra.pandas.range_indexes` function to produce
      values for this argument.

    Usage:

    The expected usage pattern is that you use :class:`column` and
    :func:`columns` to specify a fixed shape of the DataFrame you want as
    follows. For example the following gives a two column data frame:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> data_frames([
        ... column('A', dtype=int), column('B', dtype=float)]).example()
                    A              B
        0  2021915903  1.793898e+232
        1  1146643993            inf
        2 -2096165693   1.000000e+07

    If you want the values in different columns to interact in some way you
    can use the rows argument. For example the following gives a two column
    DataFrame where the value in the first column is always at most the value
    in the second:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                       0             1
        0  -3.402823e+38  9.007199e+15
        1 -1.562796e-298  5.000000e-01

    You can also combine the two:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import columns, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     columns=columns(["lo", "hi"], dtype=float),
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                      lo            hi
        0   9.314723e-49  4.353037e+45
        1  -9.999900e-01  1.000000e+07
        2 -2.152861e+134 -1.069317e-73

    (Note that the column dtype must still be specified and will not be
    inferred from the rows. This restriction may be lifted in future).

    Combining rows and columns has the following behaviour:

    * The column names and dtypes will be used.
    * If the column is required to be unique, this will be enforced.
    * Any values missing from the generated rows will be provided using the
      column's fill.
    * Any values in the row not present in the column specification (if
      dicts are passed, if there are keys with no corresponding column name,
      if sequences are passed if there are too many items) will result in
      InvalidArgument being raised.

    """
    if index is None:
        index = range_indexes()
    else:
        st.check_strategy(index)

    # The nested composites rebind ``index`` to the drawn value, so keep the
    # strategy itself under a separate name.
    index_strategy = index

    if columns is None:
        if rows is None:
            raise InvalidArgument(
                'At least one of rows and columns must be provided'
            )
        else:
            @st.composite
            def rows_only(draw):
                index = draw(index_strategy)

                @check_function
                def row():
                    result = draw(rows)
                    st.check_type(Iterable, result, 'draw(row)')
                    return result

                if len(index) > 0:
                    return pandas.DataFrame(
                        [row() for _ in index], index=index
                    )
                else:
                    # If we haven't drawn any rows we need to draw one row and
                    # then discard it so that we get a consistent shape for the
                    # DataFrame.
                    base = pandas.DataFrame([row()])
                    return base.drop(0)
            return rows_only()

    assert columns is not None
    columns = st.try_convert(tuple, columns, 'columns')

    # Validated copies of the caller's column specs; the originals are never
    # mutated because each one is copied before being rewritten.
    rewritten_columns = []
    column_names = set()

    for i, c in enumerate(columns):
        st.check_type(column, c, 'columns[%d]' % (i,))

        c = copy(c)
        if c.name is None:
            # Unnamed columns are named after their position.
            label = 'columns[%d]' % (i,)
            c.name = i
        else:
            label = c.name
            try:
                hash(c.name)
            except TypeError:
                raise InvalidArgument(
                    'Column names must be hashable, but columns[%d].name was '
                    '%r of type %s, which cannot be hashed.' % (
                        i, c.name, type(c.name).__name__,))

        if c.name in column_names:
            raise InvalidArgument(
                'duplicate definition of column name %r' % (c.name,))

        column_names.add(c.name)

        c.elements, c.dtype = elements_and_dtype(
            c.elements, c.dtype, label
        )

        if c.dtype is None and rows is not None:
            raise InvalidArgument(
                'Must specify a dtype for all columns when combining rows with'
                ' columns.'
            )

        c.fill = npst.fill_for(
            fill=c.fill, elements=c.elements, unique=c.unique,
            name=label
        )

        rewritten_columns.append(c)

    if rows is None:
        @st.composite
        def just_draw_columns(draw):
            index = draw(index_strategy)
            local_index_strategy = st.just(index)

            # Pre-populate the keys so the column order matches the spec
            # regardless of the order in which the columns are drawn below.
            data = OrderedDict((c.name, None) for c in rewritten_columns)

            # Depending on how the columns are going to be generated we group
            # them differently to get better shrinking. For columns with fill
            # enabled, the elements can be shrunk independently of the size,
            # so we can just shrink by shrinking the index then shrinking the
            # length and are generally much more free to move data around.

            # For columns with no filling the problem is harder, and drawing
            # them like that would result in rows being very far apart from
            # each other in the underlying data stream, which gets in the way
            # of shrinking. So what we do is reorder and draw those columns
            # row wise, so that the values of each row are next to each other.
            # This makes life easier for the shrinker when deleting blocks of
            # data.
            columns_without_fill = [
                c for c in rewritten_columns if c.fill.is_empty]

            if columns_without_fill:
                for c in columns_without_fill:
                    data[c.name] = pandas.Series(
                        np.zeros(shape=len(index), dtype=c.dtype),
                        index=index,
                    )
                seen = {
                    c.name: set() for c in columns_without_fill if c.unique}

                for i in hrange(len(index)):
                    for c in columns_without_fill:
                        if c.unique:
                            # Bounded number of retries to find a fresh value
                            # before giving up on this example.
                            for _ in hrange(5):
                                value = draw(c.elements)
                                if value not in seen[c.name]:
                                    seen[c.name].add(value)
                                    break
                            else:
                                reject()
                        else:
                            value = draw(c.elements)
                        # ``i`` is a position, not a label. Plain ``[]``
                        # assignment is label-based on an integer index and
                        # would enlarge the Series instead of filling row i
                        # whenever the drawn index is not 0..n-1.
                        data[c.name].iloc[i] = value

            for c in rewritten_columns:
                if not c.fill.is_empty:
                    data[c.name] = draw(series(
                        index=local_index_strategy, dtype=c.dtype,
                        elements=c.elements, fill=c.fill, unique=c.unique))

            return pandas.DataFrame(data, index=index)
        return just_draw_columns()
    else:
        @st.composite
        def assign_rows(draw):
            index = draw(index_strategy)

            # Start from a correctly-typed frame of zeros and overwrite it
            # one row at a time.
            result = pandas.DataFrame(OrderedDict(
                (c.name, pandas.Series(
                    np.zeros(dtype=c.dtype, shape=len(index)),
                    dtype=c.dtype))
                for c in rewritten_columns
            ), index=index)

            # Fill values drawn for columns missing from dict rows, cached by
            # column position so each column's fill is drawn at most once.
            fills = {}

            any_unique = any(c.unique for c in rewritten_columns)

            if any_unique:
                # One set per unique column (None for the others), trimmed
                # after the last unique column so zip() stops there.
                all_seen = [
                    set() if c.unique else None for c in rewritten_columns]
                while all_seen[-1] is None:
                    all_seen.pop()

            for row_index in hrange(len(index)):
                # Bounded number of attempts to draw an acceptable row.
                for _ in hrange(5):
                    original_row = draw(rows)
                    row = original_row
                    if isinstance(row, dict):
                        as_list = [None] * len(rewritten_columns)
                        for i, c in enumerate(rewritten_columns):
                            try:
                                as_list[i] = row[c.name]
                            except KeyError:
                                try:
                                    as_list[i] = fills[i]
                                except KeyError:
                                    fills[i] = draw(c.fill)
                                    as_list[i] = fills[i]
                        for k in row:
                            if k not in column_names:
                                raise InvalidArgument((
                                    'Row %r contains column %r not in '
                                    'columns %r)' % (
                                        row, k, [
                                            c.name for c in rewritten_columns
                                        ])))
                        row = as_list
                    if any_unique:
                        has_duplicate = False
                        # Only record values once the whole row is known to
                        # be acceptable; otherwise a row rejected because of
                        # a later column would leave its earlier values
                        # marked as seen although they never reached the
                        # result, causing spurious rejections afterwards.
                        pending = []
                        for seen, value in zip(all_seen, row):
                            if seen is None:
                                continue
                            if value in seen:
                                has_duplicate = True
                                break
                            pending.append((seen, value))
                        if has_duplicate:
                            continue
                        for seen, value in pending:
                            seen.add(value)
                    row = list(st.try_convert(tuple, row, 'draw(rows)'))

                    if len(row) > len(rewritten_columns):
                        raise InvalidArgument((
                            'Row %r contains too many entries. Has %d but '
                            'expected at most %d') % (
                                original_row, len(row), len(rewritten_columns)
                        ))
                    # Short sequences are padded using the fill of each
                    # missing trailing column.
                    while len(row) < len(rewritten_columns):
                        row.append(draw(rewritten_columns[len(row)].fill))
                    result.iloc[row_index] = row
                    break
                else:
                    reject()
            return result
        return assign_rows()