Example #1
def test_is_extension_type():
    assert not com.is_extension_type([1, 2, 3])
    assert not com.is_extension_type(np.array([1, 2, 3]))
    assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_type(cat)
    assert com.is_extension_type(pd.Series(cat))
    assert com.is_extension_type(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_type(pd.SparseSeries([1, 2, 3]))
    assert com.is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_type(s)

    # This test will only skip if the previous assertions
    # pass AND scipy is not installed.
    sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_extension_type(sparse.bsr_matrix([1, 2, 3]))
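A note on the API shown here: is_extension_type was deprecated in later pandas releases in favor of is_extension_array_dtype (compare Example #5). A minimal standalone sketch of the public-API equivalent, assuming a reasonably recent pandas:

import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype

# Plain lists and ndarrays are not backed by extension arrays
assert not is_extension_array_dtype(np.array([1, 2, 3]))
# Categorical and tz-aware datetime data are
assert is_extension_array_dtype(pd.Categorical([1, 2, 3]))
assert is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern"))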
Example #2
File: utils.py Project: z7ye/dask-1
def _nonempty_series(s, idx=None):
    # TODO: Use registered dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp("1970-01-01", tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = s.cat.categories[:0]
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
    elif is_sparse(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_100:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        data = pd.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    out = pd.Series(data, name=s.name, index=idx)
    if PANDAS_GT_100:
        out.attrs = s.attrs
    return out
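The PANDAS_GT_100 branches above reflect the pandas 1.0 change that routed sparse construction through pd.array with a SparseDtype. A minimal sketch of that path, assuming pandas >= 1.0:

import pandas as pd

dtype = pd.SparseDtype("float64", fill_value=0.0)
data = pd.array([0.0, 0.0], dtype=dtype)  # extension array, not an ndarray
out = pd.Series(data, name="x")
print(out.dtype)  # Sparse[float64, 0.0]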
Example #3
 def test_pandas_sparse(self):
     import pandas as pd
     X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                       "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                       "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
     y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
     X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                            "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                            "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
     if pd.__version__ >= '0.24.0':
         for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
             self.assertTrue(pd.api.types.is_sparse(dtype))
     gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
     pred_sparse = gbm.predict(X_test, raw_score=True)
     if hasattr(X_test, 'sparse'):
         pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
     else:
         pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
     np.testing.assert_allclose(pred_sparse, pred_dense)
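The hasattr(X_test, 'sparse') check above distinguishes the DataFrame-level .sparse accessor (added in pandas 0.25) from the older SparseDataFrame.to_dense(). A small sketch of the accessor path, assuming pandas >= 0.25:

import pandas as pd

df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 0, 1, 0])})
dense = df.sparse.to_dense()  # densify every sparse column
print(df.dtypes["A"])         # Sparse[int64, 0]
print(dense.dtypes["A"])      # int64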
Example #4
def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc):
    # Test that
    #   * ufunc(Series, scalar) == Series(ufunc(array, scalar))
    #   * ufunc(Series, scalar) == ufunc(scalar, Series)
    array, _ = arrays_for_binary_ufunc
    if sparse:
        array = pd.SparseArray(array)
    other = 2
    series = pd.Series(array, name="name")

    series_args = (series, other)
    array_args = (array, other)

    if flip:
        series_args = tuple(reversed(series_args))
        array_args = tuple(reversed(array_args))

    expected = pd.Series(ufunc(*array_args), name="name")
    result = ufunc(*series_args)

    tm.assert_series_equal(result, expected)
Example #5
def test_is_extension_array_dtype(check_scipy):
    assert not com.is_extension_array_dtype([1, 2, 3])
    assert not com.is_extension_array_dtype(np.array([1, 2, 3]))
    assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_array_dtype(cat)
    assert com.is_extension_array_dtype(pd.Series(cat))
    assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_array_dtype(
        pd.DatetimeIndex(["2000"], tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_array_dtype(s)

    if check_scipy:
        import scipy.sparse

        assert not com.is_extension_array_dtype(
            scipy.sparse.bsr_matrix([1, 2, 3]))
Example #6
    def test_constructor_preserve_attr(self):
        # GH 13866
        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
        assert arr.dtype == np.int64
        assert arr.fill_value == 0

        df = pd.SparseDataFrame({'x': arr})
        assert df['x'].dtype == np.int64
        assert df['x'].fill_value == 0

        s = pd.SparseSeries(arr, name='x')
        assert s.dtype == np.int64
        assert s.fill_value == 0

        df = pd.SparseDataFrame(s)
        assert df['x'].dtype == np.int64
        assert df['x'].fill_value == 0

        df = pd.SparseDataFrame({'x': s})
        assert df['x'].dtype == np.int64
        assert df['x'].fill_value == 0
Example #7
    def test_constructor_preserve_attr(self):
        # GH 13866
        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
        self.assertEqual(arr.dtype, np.int64)
        self.assertEqual(arr.fill_value, 0)

        df = pd.SparseDataFrame({'x': arr})
        self.assertEqual(df['x'].dtype, np.int64)
        self.assertEqual(df['x'].fill_value, 0)

        s = pd.SparseSeries(arr, name='x')
        self.assertEqual(s.dtype, np.int64)
        self.assertEqual(s.fill_value, 0)

        df = pd.SparseDataFrame(s)
        self.assertEqual(df['x'].dtype, np.int64)
        self.assertEqual(df['x'].fill_value, 0)

        df = pd.SparseDataFrame({'x': s})
        self.assertEqual(df['x'].dtype, np.int64)
        self.assertEqual(df['x'].fill_value, 0)
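Examples #6 and #7 are the same test written in pytest and unittest style; both rely on SparseSeries/SparseDataFrame, which were removed in pandas 1.0. A hedged sketch of the same fill-value-preservation check using the SparseDtype API that replaced them:

import numpy as np
import pandas as pd

arr = pd.arrays.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
df = pd.DataFrame({"x": arr})
assert df["x"].dtype == pd.SparseDtype(np.int64, fill_value=0)
assert df["x"].sparse.fill_value == 0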
Example #8
def load_dict(file_name: str, tokenizer: "spacy.tokenizer.Tokenizer"):
    """
    Load a SystemT-format dictionary file. File format is one entry per line.

    Tokenizes and normalizes the dictionary entries.

    :param file_name: Path to dictionary file

    :param tokenizer: Preconfigured tokenizer object for tokenizing
    dictionary entries.  **Must be the same configuration as the tokenizer
    used on the target text!**

    :return: a `pd.DataFrame` with the normalized entries.
    """
    with open(file_name, "r") as f:
        lines = [
            line.strip() for line in f.readlines()
            if len(line) > 0 and line[0] != "#"
        ]

    # Tokenize with SpaCy. Produces a SpaCy document object per line.
    tokenized_entries = [tokenizer(line.lower()) for line in lines]

    # Determine the number of tokens in the longest dictionary entry.
    max_num_toks = max([len(e) for e in tokenized_entries])

    # Generate a column for each token. Go one past the max number of tokens so
    # that every dictionary entry ends up None-terminated.
    cols_dict = {}
    for i in range(max_num_toks + 1):
        # Extract token i from every entry that has a token i
        toks_list = [
            e[i].text if len(e) > i else None for e in tokenized_entries
        ]
        cols_dict["toks_{}".format(i)] = (
            # Sparse storage for tokens 2 and onward
            toks_list if i == 0 or not _SPARSE_DICT_ENTRIES else
            pd.SparseArray(toks_list))

    return pd.DataFrame(cols_dict)
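An aside on the toks_i columns built above: a SparseArray over a list of strings and Nones lands on object dtype with NaN as the implicit fill value, so the mostly-None tail columns stay cheap. A small illustration:

import pandas as pd

col = pd.arrays.SparseArray(["the", None, "a", None])
print(col.dtype)    # Sparse[object, nan]
print(col.density)  # 0.5 -- only the two strings are materialized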
Example #9
def create_dict(entries: Iterable[str],
                tokenizer: "spacy.tokenizer.Tokenizer" = None) -> pd.DataFrame:
    """
    Create a dictionary from a list of entries, where each entry is expressed as a
    single string.

    Tokenizes and normalizes the dictionary entries.

    :param entries: Iterable of strings, one string per dictionary entry.

    :param tokenizer: Preconfigured tokenizer object for tokenizing
    dictionary entries.  **Must always tokenize the same way as the tokenizer
    used on the target text!**  If None, this method will use tokenizer returned by
    :func:`text_extensions_for_pandas.io.spacy.simple_tokenizer()`.

    :return: a `pd.DataFrame` with the normalized, tokenized dictionary entries.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()

    # Tokenize with SpaCy. Produces a SpaCy document object per line.
    tokenized_entries = [tokenizer(entry.lower()) for entry in entries]

    # Determine the number of tokens in the longest dictionary entry.
    max_num_toks = max([len(e) for e in tokenized_entries])

    # Generate a column for each token. Go one past the max number of tokens so
    # that every dictionary entry ends up None-terminated.
    cols_dict = {}
    for i in range(max_num_toks + 1):
        # Extract token i from every entry that has a token i
        toks_list = [
            e[i].text if len(e) > i else None for e in tokenized_entries
        ]
        cols_dict["toks_{}".format(i)] = (
            # Sparse storage for tokens 2 and onward
            toks_list if i == 0 or not _SPARSE_DICT_ENTRIES else
            pd.SparseArray(toks_list))

    return pd.DataFrame(cols_dict)
Example #10
    def test_concat_mixed_dtypes(self, data):
        # https://github.com/pandas-dev/pandas/issues/20762
        df1 = pd.DataFrame({'A': data[:3]})
        df2 = pd.DataFrame({"A": [1, 2, 3]})
        df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
        df4 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
        dfs = [df1, df2, df3, df4]

        # dataframes
        result = pd.concat(dfs)
        expected = pd.concat([x.astype(object) for x in dfs])
        self.assert_frame_equal(result, expected)

        # series
        result = pd.concat([x['A'] for x in dfs])
        expected = pd.concat([x['A'].astype(object) for x in dfs])
        self.assert_series_equal(result, expected)

        # simple test for just EA and one other
        result = pd.concat([df1, df2])
        expected = pd.concat([df1.astype('object'), df2.astype('object')])
        self.assert_frame_equal(result, expected)
Example #11
    def _compare_other(self, s, data, op_name, other):
        op = self.get_op_from_name(op_name)

        # array
        result = pd.Series(op(data, other))
        # hard to test the fill value, since we don't know what expected
        # is in general.
        # Rely on tests in `tests/sparse` to validate that.
        assert isinstance(result.dtype, SparseDtype)
        assert result.dtype.subtype == np.dtype('bool')

        with np.errstate(all='ignore'):
            expected = pd.Series(
                pd.SparseArray(op(np.asarray(data), np.asarray(other)),
                               fill_value=result.values.fill_value))

        tm.assert_series_equal(result, expected)

        # series
        s = pd.Series(data)
        result = op(s, other)
        tm.assert_series_equal(result, expected)
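A compact illustration of the dtype assertions in _compare_other: comparing sparse data yields a SparseDtype whose subtype is bool, with the fill value obtained by applying the comparison to the original fill value:

import pandas as pd

arr = pd.arrays.SparseArray([0, 1, 0])  # default fill_value is 0
res = arr != 0
print(res.dtype)          # Sparse[bool, False]
print(res.dtype.subtype)  # bool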
Example #12
class TestABCClasses(object):
    tuples = [[1, 2, 2], ['red', 'blue', 'red']]
    multi_index = pd.MultiIndex.from_arrays(tuples, names=('number', 'color'))
    datetime_index = pd.to_datetime(['2000/1/1', '2010/1/1'])
    timedelta_index = pd.to_timedelta(np.arange(5), unit='s')
    period_index = pd.period_range('2000/1/1', '2010/1/1/', freq='M')
    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
    df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
    sparse_series = pd.Series([1, 2, 3]).to_sparse()
    sparse_array = pd.SparseArray(np.random.randn(10))

    def test_abc_types(self):
        assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
        assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
        assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
        assert isinstance(self.multi_index, gt.ABCMultiIndex)
        assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
        assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
        assert isinstance(self.period_index, gt.ABCPeriodIndex)
        assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
        assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass)
        assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
        assert isinstance(self.df, gt.ABCDataFrame)
        with catch_warnings(record=True):
            assert isinstance(self.df.to_panel(), gt.ABCPanel)
        assert isinstance(self.sparse_series, gt.ABCSparseSeries)
        assert isinstance(self.sparse_array, gt.ABCSparseArray)
        assert isinstance(self.categorical, gt.ABCCategorical)
        assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)

        assert isinstance(pd.DateOffset(), gt.ABCDateOffset)
        assert isinstance(pd.Period('2012', freq='A-DEC').freq,
                          gt.ABCDateOffset)
        assert not isinstance(pd.Period('2012', freq='A-DEC'),
                              gt.ABCDateOffset)
Example #13
    def test_subclass_sparse_to_frame(self):
        s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx')
        res = s.to_frame()

        exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block',
                                 fill_value=0)
        exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr},
                                           index=list('ab'),
                                           default_fill_value=0)
        tm.assert_sp_frame_equal(res, exp)

        # create from int dict
        res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]},
                                           index=list('ab'),
                                           default_fill_value=0)
        tm.assert_sp_frame_equal(res, exp)

        s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'),
                                      name='xxx')
        res = s.to_frame()
        exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]},
                                           index=list('ab'))
        tm.assert_sp_frame_equal(res, exp)
Example #14
    def test_loc(self):
        # need to be override to use different label
        orig = self.orig
        sparse = self.sparse

        tm.assert_sp_series_equal(sparse.loc['A'],
                                  orig.loc['A'].to_sparse())
        tm.assert_sp_series_equal(sparse.loc['B'],
                                  orig.loc['B'].to_sparse())

        result = sparse.loc[[1, 3, 4]]
        exp = orig.loc[[1, 3, 4]].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # exceeds the bounds
        result = sparse.loc[[1, 3, 4, 5]]
        exp = orig.loc[[1, 3, 4, 5]].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # single element list (GH 15447)
        result = sparse.loc[['A']]
        exp = orig.loc[['A']].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # dense array
        result = sparse.loc[orig % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array (actually it coerces to a normal Series)
        result = sparse.loc[sparse % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array
        result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
        tm.assert_sp_series_equal(result, exp)
Example #15
    def _convert_arff_coo(features, columns, arff_data_data):
        if features is None:
            data = [([], []) for _ in columns]
        else:
            fset = remove_dups_from_list(features)
            data = [([], []) if c in fset else None for c in columns]

        for v, i, j in zip(*arff_data_data):
            d = data[j]
            if d is not None:
                indices, values = d
                if indices:
                    assert indices[-1] < i
                indices.append(i)
                values.append(v)

        max_i = -1
        for d in data:
            if d is not None and len(d[0]) > 0:
                max_i = max(max_i, d[0][-1])
        height = max_i + 1

        series = []
        for d in data:
            if d is None:
                s = None
            else:
                keys, values = d
                sa = pd.SparseArray(values,
                                    sparse_index=pd._libs.sparse.IntIndex(
                                        height, keys),
                                    fill_value=0)
                s = pd.Series(sa.values)
            series.append(s)

        return series
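The IntIndex used above comes from pd._libs.sparse, a private pandas module, so this construction is version-sensitive. A hedged standalone sketch of the same idea, building a sparse array of a given height from explicit positions and values:

import pandas as pd
from pandas._libs.sparse import IntIndex  # private API; may change

# height 5, non-fill entries at positions 1 and 3
sa = pd.arrays.SparseArray([7, 9], sparse_index=IntIndex(5, [1, 3]),
                           fill_value=0)
print(list(sa))  # [0, 7, 0, 9, 0]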
Example #16
    def test_loc_index(self):
        orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE"))
        sparse = orig.to_sparse()

        assert sparse.loc["A"] == 1
        assert np.isnan(sparse.loc["B"])

        result = sparse.loc[["A", "C", "D"]]
        exp = orig.loc[["A", "C", "D"]].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # dense array
        result = sparse.loc[orig % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array (actually it coerces to a normal Series)
        result = sparse.loc[sparse % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array
        result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
        tm.assert_sp_series_equal(result, exp)
Example #17
    def test_loc_index(self):
        orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE'))
        sparse = orig.to_sparse()

        self.assertEqual(sparse.loc['A'], 1)
        self.assertTrue(np.isnan(sparse.loc['B']))

        result = sparse.loc[['A', 'C', 'D']]
        exp = orig.loc[['A', 'C', 'D']].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # dense array
        result = sparse.loc[orig % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array (actually it coerces to a normal Series)
        result = sparse.loc[sparse % 2 == 1]
        exp = orig.loc[orig % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(result, exp)

        # sparse array
        result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
        tm.assert_sp_series_equal(result, exp)
Example #18
def test_is_bool_dtype_sparse():
    result = is_bool_dtype(pd.Series(pd.SparseArray([True, False])))
    assert result is True
Example #19
def test_registry_find(dtype, expected):
    assert registry.find(dtype) == expected


@pytest.mark.parametrize(
    "dtype, expected",
    [
        (str, False),
        (int, False),
        (bool, True),
        (np.bool, True),
        (np.array(["a", "b"]), False),
        (pd.Series([1, 2]), False),
        (np.array([True, False]), True),
        (pd.Series([True, False]), True),
        (pd.SparseArray([True, False]), True),
        (SparseDtype(bool), True),
    ],
)
def test_is_bool_dtype(dtype, expected):
    result = is_bool_dtype(dtype)
    assert result is expected


def test_is_bool_dtype_sparse():
    result = is_bool_dtype(pd.Series(pd.SparseArray([True, False])))
    assert result is True


@pytest.mark.parametrize(
    "check",
Example #20
def test_is_scipy_sparse():
    from scipy.sparse import bsr_matrix
    assert com.is_scipy_sparse(bsr_matrix([1, 2, 3]))

    assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3]))
    assert not com.is_scipy_sparse(pd.SparseSeries([1, 2, 3]))
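As the test shows, scipy sparse matrices and pandas sparse structures are distinct types. For moving between the two, pandas >= 0.25 offers Series.sparse.from_coo and Series.sparse.to_coo; a minimal sketch, assuming scipy is installed:

import pandas as pd
from scipy import sparse

coo = sparse.coo_matrix(([3.0, 1.0], ([0, 2], [1, 0])), shape=(3, 2))
s = pd.Series.sparse.from_coo(coo)  # MultiIndex (row, col) -> value
print(s.dtype)                      # Sparse[float64, nan]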
Example #21
 def test_boolean_slice_empty(self):
     arr = pd.SparseArray([0, 1, 2])
     res = arr[[False, False, False]]
     assert res.dtype == arr.dtype
Example #22
def test_unique_na_fill(arr, fill_value):
    a = pd.SparseArray(arr, fill_value=fill_value).unique()
    b = pd.Series(arr).unique()
    assert isinstance(a, SparseArray)
    a = np.asarray(a)
    tm.assert_numpy_array_equal(a, b)
Example #23
 def test_asarray_datetime64(self):
     s = pd.SparseArray(pd.to_datetime(['2012', None, None, '2013']))
     np.asarray(s)
Example #24
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),

    (pd.TimedeltaIndex(['1H', '2H']), None,
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),

    # Category
    (['a', 'b'], 'category', pd.Categorical(['a', 'b'])),
    (['a', 'b'], pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(['a', 'b'], ordered=True)),

    # Interval
    ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval',
     pd.IntervalArray.from_tuples([(1, 2), (3, 4)])),

    # Sparse
    ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')),

    # IntegerNA
    ([1, None], 'Int16', integer_array([1, None], dtype='Int16')),
    (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),

    # Index
    (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),

    # Series[EA] returns the EA
    (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
     None,
     pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),

    # "3rd party" EAs work
    ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])),
Example #25
def lookup_sv_counts(spec_sv_tup):
    spec_idx, ordered_sv_idx, seqtab_T = spec_sv_tup
    return pd.SparseArray(
        [seqtab_T.iat[sv_idx, spec_idx] for sv_idx in ordered_sv_idx])
Example #26
 def test_asarray_datetime64(self):
     s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
     np.asarray(s)
Example #27
    def test_loc(self):
        orig = pd.DataFrame([[1, np.nan, np.nan],
                             [2, 3, np.nan],
                             [np.nan, np.nan, 4]],
                            columns=list('xyz'))
        sparse = orig.to_sparse()

        assert sparse.loc[0, 'x'] == 1
        assert np.isnan(sparse.loc[1, 'z'])
        assert sparse.loc[2, 'z'] == 4

        # have to specify `kind='integer'`, since we construct a
        # new SparseArray here, and the default sparse type is
        # integer there, but block in SparseSeries
        tm.assert_sp_series_equal(sparse.loc[0],
                                  orig.loc[0].to_sparse(kind='integer'))
        tm.assert_sp_series_equal(sparse.loc[1],
                                  orig.loc[1].to_sparse(kind='integer'))
        tm.assert_sp_series_equal(sparse.loc[2, :],
                                  orig.loc[2, :].to_sparse(kind='integer'))
        tm.assert_sp_series_equal(sparse.loc[2, :],
                                  orig.loc[2, :].to_sparse(kind='integer'))
        tm.assert_sp_series_equal(sparse.loc[:, 'y'],
                                  orig.loc[:, 'y'].to_sparse())
        tm.assert_sp_series_equal(sparse.loc[:, 'y'],
                                  orig.loc[:, 'y'].to_sparse())

        result = sparse.loc[[1, 2]]
        exp = orig.loc[[1, 2]].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        result = sparse.loc[[1, 2], :]
        exp = orig.loc[[1, 2], :].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        result = sparse.loc[:, ['x', 'z']]
        exp = orig.loc[:, ['x', 'z']].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        result = sparse.loc[[0, 2], ['x', 'z']]
        exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        # exceeds the bounds
        result = sparse.reindex([1, 3, 4, 5])
        exp = orig.reindex([1, 3, 4, 5]).to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        # dense array
        result = sparse.loc[orig.x % 2 == 1]
        exp = orig.loc[orig.x % 2 == 1].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        # sparse array (actually it coerces to a normal Series)
        result = sparse.loc[sparse.x % 2 == 1]
        exp = orig.loc[orig.x % 2 == 1].to_sparse()
        tm.assert_sp_frame_equal(result, exp)

        # sparse array
        result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
        tm.assert_sp_frame_equal(result, exp)
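On the kind='integer' comment in this test: SparseArray supports two layouts for its sparse index, 'integer' (store each non-fill position) and 'block' (store runs as block starts plus lengths). A quick illustration:

import pandas as pd

a = pd.arrays.SparseArray([1, 1, 0, 0, 1], fill_value=0, kind="integer")
b = pd.arrays.SparseArray([1, 1, 0, 0, 1], fill_value=0, kind="block")
print(type(a.sp_index).__name__)  # IntIndex
print(type(b.sp_index).__name__)  # BlockIndex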
Example #28
def main():
    args_parser = argparse.ArgumentParser(
        description="""A small utility to convert dada2 style
    seqtables to a MOTHUR style sharetable and/or pplacer-style map and weights files.
    """)

    args_parser.add_argument('--seqtable',
                             '-s',
                             help="Sequence table from dada2, in CSV format",
                             required=True,
                             type=argparse.FileType('r'))
    args_parser.add_argument(
        '--fasta_out_sequences',
        '-f',
        help="Write sequence variants to this file, in FASTA format",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--map',
        '-m',
        help="Write pplacer-style mapping of sv to specimen",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--weights',
        '-w',
        help="Write pplacer-style weights of sv by specimen",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--long',
        '-L',
        help="Write out specimen, sv_id, count in long format",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--sharetable',
        '-t',
        help="Write mothur-style sharetable to this location",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--cpus',
        '-C',
        help="Number of threads to use. Default is number of vCPU available",
        type=int,
        default=None)

    args = args_parser.parse_args()

    # Check to see if we've been tasked with anything. If not, we have nothing to do and should exit
    if not (args.fasta_out_sequences or args.map or args.weights
            or args.sharetable or args.long):
        sys.exit("Nothing to do")

    # Just convert our handles over to something nicer
    if args.fasta_out_sequences:
        out_sv_seqs_h = args.fasta_out_sequences
    else:
        out_sv_seqs_h = None
    if args.map:
        out_map_h = args.map
        map_writer = csv.writer(out_map_h)
    else:
        map_writer = None
    if args.weights:
        out_weights_h = args.weights
        weights_writer = csv.writer(out_weights_h)
    else:
        weights_writer = None
    if args.long:
        long_writer = csv.writer(args.long)
        # Header
        long_writer.writerow(['specimen', 'sv', 'count'])
    else:
        long_writer = None
    if args.sharetable:
        sharetable_fn = args.sharetable
    else:
        sharetable_fn = None
    logging.info("Loading DADA2 seqtable")
    # Load the sequence table

    # Reduce memory use by streaming in and using sparse structures
    seqtab_T = pd.DataFrame()

    seqtab_reader = csv.reader(args.seqtable)
    # Get the header, which are the SV sequences themselves
    sv_header = next(seqtab_reader)[1:]
    for r in seqtab_reader:
        specimen = r[0]
        counts = [int(c) for c in r[1:]]
        seqtab_T[specimen] = pd.SparseArray(counts, dtype=int, fill_value=0)
    logging.info("DADA2 Seqtable loaded")
    # Order the SVs by their mean relative abundance
    logging.info("Ordering SV by mean relative abundance")
    ordered_sv_idx = list(
        (seqtab_T /
         seqtab_T.sum(axis=0)).mean(axis=1).sort_values(ascending=False).index)

    # Generate sv labels for each sequence variant,
    # and generate a dictionary to map sv_id to sequence-variant
    logging.info("Generating SV names")
    seq_idx_to_sv_num = {
        idx: 'sv-%d' % (i + 1)
        for i, idx in enumerate(ordered_sv_idx)
    }
    # Transpose, Reorder and Rename into a new seqtab
    logging.info("Generating new seqtable")
    convert_pool = Pool(args.cpus)
    num_specimens = len(seqtab_T.columns)
    seqtab_reorder = pd.DataFrame(
        convert_pool.imap(
            lookup_sv_counts,
            zip(range(num_specimens), [ordered_sv_idx] * num_specimens,
                [seqtab_T] * num_specimens)),
        columns=[seq_idx_to_sv_num[sv_idx] for sv_idx in ordered_sv_idx],
        index=seqtab_T.columns)
    logging.info("Reordered seqtable done")

    # Annoyingly, we need to pick a representative actual sequence
    # from each sv to be its champion for guppy.
    # To do so, we will go through each column, find the max count for that sv,
    # and use that specimen as the champion
    logging.info("Finding maximum specimen for each SV")
    max_spec_for_sv = {
        sv_id: spec
        for sv_id, spec in seqtab_reorder.apply(lambda c: c.idxmax()).items()
    }

    if out_sv_seqs_h is not None:
        # Write out the sequences in fasta format, using the sv-id's generated above as an ID
        logging.info("Writing out SV to FASTA")
        for sv_idx in ordered_sv_idx:
            out_sv_seqs_h.write(">%s:%s\n%s\n" %
                                (seq_idx_to_sv_num[sv_idx],
                                 max_spec_for_sv[seq_idx_to_sv_num[sv_idx]],
                                 sv_header[sv_idx]))

    # Now write the mapping and weights files
    # Both are headerless CSV format files
    # map: sequence_id (sv_id:specimen), specimen
    # weight: sequence_id (sv_id here), specimen_sequence_id (sv_id:specimen here), count
    # This is a bit of a clunky structure (relating to some historic cruft)

    if map_writer or weights_writer or long_writer:
        logging.info("Writing out long, map, and/or weights")
        for spec, row in seqtab_reorder.iterrows():
            row_nonzero = row[row > 0]
            for sv_id, count in row_nonzero.items():
                if map_writer is not None:
                    map_writer.writerow([str(sv_id) + ":" + str(spec), spec])
                if weights_writer is not None:
                    weights_writer.writerow([
                        sv_id + ":" + max_spec_for_sv[sv_id],
                        str(sv_id) + ":" + str(spec), count
                    ])
                if long_writer is not None:
                    long_writer.writerow(
                        [spec, sv_id + ":" + max_spec_for_sv[sv_id], count])

    if sharetable_fn is not None:
        sharetable_labels = pd.DataFrame()
        sharetable_labels['label'] = list(seqtab_reorder.index)
        sharetable_labels['group'] = "dada2"
        sharetable_labels['numsvs'] = len(seqtab_reorder.columns)
        sharetable_labels.head()
        pd.merge(sharetable_labels,
                 seqtab_reorder,
                 left_on='label',
                 right_index=True).to_csv(sharetable_fn, index=False, sep='\t')

    # Cleanup.
    if out_sv_seqs_h:
        out_sv_seqs_h.close()
    if map_writer:
        out_map_h.close()
    if weights_writer:
        out_weights_h.close()
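A small illustration of the memory argument behind the sparse loader above: a mostly-zero count column stored as a SparseArray materializes only its non-fill entries:

import pandas as pd

counts = [0] * 98 + [5, 7]
col = pd.arrays.SparseArray(counts, dtype=int, fill_value=0)
print(col.density)  # 0.02 -- 2 of 100 values actually stored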
Example #29
 ),
 # Category
 (["a", "b"], "category", pd.Categorical(["a", "b"])),
 (
     ["a", "b"],
     pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(["a", "b"], ordered=True),
 ),
 # Interval
 (
     [pd.Interval(1, 2), pd.Interval(3, 4)],
     "interval",
     pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]),
 ),
 # Sparse
 ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")),
 # IntegerNA
 ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
 (pd.Series([1, 2]), None, PandasArray(np.array([1, 2],
                                                dtype=np.int64))),
 # Index
 (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))
  ),
 # Series[EA] returns the EA
 (
     pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
     None,
     pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
 ),
 # "3rd party" EAs work
 ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal(
Example #30
def test_ndarray_values(array, expected):
    l_values = pd.Series(array)._ndarray_values
    r_values = pd.Index(array)._ndarray_values
    tm.assert_numpy_array_equal(l_values, r_values)
    tm.assert_numpy_array_equal(l_values, expected)


@pytest.mark.parametrize(
    "array, attr",
    [
        (np.array([1, 2], dtype=np.int64), None),
        (pd.Categorical(['a', 'b']), '_codes'),
        (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'),
        (pd.core.arrays.integer_array([0, np.nan]), '_data'),
        (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'),
        (pd.SparseArray([0, 1]), '_sparse_values'),
        # TODO: DatetimeArray(add)
    ])
@pytest.mark.parametrize('box', [pd.Series, pd.Index])
def test_array(array, attr, box):
    if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index:
        pytest.skip("No index type for {}".format(array.dtype))
    result = box(array, copy=False).array

    if attr:
        array = getattr(array, attr)
        result = getattr(result, attr)

    assert result is array
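A quick check of what test_array asserts for the sparse case: Series.array hands back the extension array the Series wraps, so sparse data stays sparse end to end:

import pandas as pd

sa = pd.arrays.SparseArray([0, 1])
arr = pd.Series(sa, copy=False).array
print(type(arr).__name__, arr.dtype)  # SparseArray Sparse[int64, 0]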