def test_is_extension_type():
    assert not com.is_extension_type([1, 2, 3])
    assert not com.is_extension_type(np.array([1, 2, 3]))
    assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_type(cat)
    assert com.is_extension_type(pd.Series(cat))
    assert com.is_extension_type(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_type(pd.SparseSeries([1, 2, 3]))
    assert com.is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_type(s)

    # This test will only skip if the previous assertions
    # pass AND scipy is not installed.
    sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_extension_type(sparse.bsr_matrix([1, 2, 3]))
def _nonempty_series(s, idx=None):
    # TODO: Use registered dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp("1970-01-01", tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = s.cat.categories[:0]
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
    elif is_sparse(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_100:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        data = pd.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    out = pd.Series(data, name=s.name, index=idx)
    if PANDAS_GT_100:
        out.attrs = s.attrs
    return out
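# A minimal usage sketch for _nonempty_series (hedged: it relies on the same
# module-level helpers the function uses, e.g. _scalar_from_dtype and
# PANDAS_GT_100, and the inputs here are assumptions): given an empty sparse
# Series, it returns a two-row stand-in with the same dtype and name.
empty = pd.Series(pd.SparseArray([], dtype="float64"), name="x")
stand_in = _nonempty_series(empty, idx=pd.RangeIndex(2))
assert len(stand_in) == 2 and stand_in.dtype == empty.dtype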
def test_pandas_sparse(self):
    import pandas as pd
    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    if pd.__version__ >= '0.24.0':
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    if hasattr(X_test, 'sparse'):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)
def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc):
    # Test that
    #   * ufunc(Series, scalar) == Series(ufunc(array, scalar))
    #   * ufunc(Series, scalar) == ufunc(scalar, Series)
    array, _ = arrays_for_binary_ufunc
    if sparse:
        array = pd.SparseArray(array)
    other = 2
    series = pd.Series(array, name="name")

    series_args = (series, other)
    array_args = (array, other)

    if flip:
        series_args = tuple(reversed(series_args))
        array_args = tuple(reversed(array_args))

    expected = pd.Series(ufunc(*array_args), name="name")
    result = ufunc(*series_args)

    tm.assert_series_equal(result, expected)
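# A concrete instance of the property checked above (a hedged sketch:
# np.add and a small integer array stand in for the ufunc/array fixtures).
arr = pd.SparseArray(np.array([1, 2, 3]))
ser = pd.Series(arr, name="name")
tm.assert_series_equal(np.add(ser, 2),
                       pd.Series(np.add(arr, 2), name="name"))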
def test_is_extension_array_dtype(check_scipy):
    assert not com.is_extension_array_dtype([1, 2, 3])
    assert not com.is_extension_array_dtype(np.array([1, 2, 3]))
    assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_array_dtype(cat)
    assert com.is_extension_array_dtype(pd.Series(cat))
    assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_array_dtype(
        pd.DatetimeIndex(["2000"], tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_array_dtype(s)

    if check_scipy:
        import scipy.sparse
        assert not com.is_extension_array_dtype(
            scipy.sparse.bsr_matrix([1, 2, 3]))
def test_constructor_preserve_attr(self):
    # GH 13866
    arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
    assert arr.dtype == np.int64
    assert arr.fill_value == 0

    df = pd.SparseDataFrame({'x': arr})
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0

    s = pd.SparseSeries(arr, name='x')
    assert s.dtype == np.int64
    assert s.fill_value == 0

    df = pd.SparseDataFrame(s)
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0

    df = pd.SparseDataFrame({'x': s})
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0
def test_constructor_preserve_attr(self):
    # GH 13866
    arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
    self.assertEqual(arr.dtype, np.int64)
    self.assertEqual(arr.fill_value, 0)

    df = pd.SparseDataFrame({'x': arr})
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)

    s = pd.SparseSeries(arr, name='x')
    self.assertEqual(s.dtype, np.int64)
    self.assertEqual(s.fill_value, 0)

    df = pd.SparseDataFrame(s)
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)

    df = pd.SparseDataFrame({'x': s})
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)
def load_dict(file_name: str, tokenizer: "spacy.tokenizer.Tokenizer"):
    """
    Load a SystemT-format dictionary file. File format is one entry per line.

    Tokenizes and normalizes the dictionary entries.

    :param file_name: Path to dictionary file
    :param tokenizer: Preconfigured tokenizer object for tokenizing
        dictionary entries. **Must be the same configuration as the
        tokenizer used on the target text!**
    :return: a `pd.DataFrame` with the normalized entries.
    """
    with open(file_name, "r") as f:
        lines = [
            line.strip() for line in f.readlines()
            if len(line) > 0 and line[0] != "#"
        ]

    # Tokenize with SpaCy. Produces a SpaCy document object per line.
    tokenized_entries = [tokenizer(line.lower()) for line in lines]

    # Determine the number of tokens in the longest dictionary entry.
    max_num_toks = max([len(e) for e in tokenized_entries])

    # Generate a column for each token. Go one past the max number of tokens so
    # that every dictionary entry ends up None-terminated.
    cols_dict = {}
    for i in range(max_num_toks + 1):
        # Extract token i from every entry that has a token i
        toks_list = [e[i].text if len(e) > i else None
                     for e in tokenized_entries]
        cols_dict["toks_{}".format(i)] = (
            # Sparse storage for tokens 2 and onward
            toks_list if i == 0 or not _SPARSE_DICT_ENTRIES
            else pd.SparseArray(toks_list))

    return pd.DataFrame(cols_dict)
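# Hypothetical usage of load_dict (the file name "colors.dict" and the blank
# English spacy pipeline are assumptions, not part of the original source):
import spacy

nlp = spacy.blank("en")
color_dict = load_dict("colors.dict", nlp.tokenizer)
print(color_dict.head())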
def create_dict(entries: Iterable[str],
                tokenizer: "spacy.tokenizer.Tokenizer" = None) -> pd.DataFrame:
    """
    Create a dictionary from a list of entries, where each entry is expressed
    as a single string.

    Tokenizes and normalizes the dictionary entries.

    :param entries: Iterable of strings, one string per dictionary entry.
    :param tokenizer: Preconfigured tokenizer object for tokenizing
        dictionary entries. **Must always tokenize the same way as the
        tokenizer used on the target text!** If None, this method will use
        the tokenizer returned by
        :func:`text_extensions_for_pandas.io.spacy.simple_tokenizer()`.
    :return: a `pd.DataFrame` with the normalized, tokenized dictionary
        entries.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()

    # Tokenize with SpaCy. Produces a SpaCy document object per line.
    tokenized_entries = [tokenizer(entry.lower()) for entry in entries]

    # Determine the number of tokens in the longest dictionary entry.
    max_num_toks = max([len(e) for e in tokenized_entries])

    # Generate a column for each token. Go one past the max number of tokens so
    # that every dictionary entry ends up None-terminated.
    cols_dict = {}
    for i in range(max_num_toks + 1):
        # Extract token i from every entry that has a token i
        toks_list = [e[i].text if len(e) > i else None
                     for e in tokenized_entries]
        cols_dict["toks_{}".format(i)] = (
            # Sparse storage for tokens 2 and onward
            toks_list if i == 0 or not _SPARSE_DICT_ENTRIES
            else pd.SparseArray(toks_list))

    return pd.DataFrame(cols_dict)
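# A small usage sketch with in-memory entries (entry strings are assumptions);
# with tokenizer=None the default simple_tokenizer() is used, as documented
# above.
dict_df = create_dict(["Red Sox", "New York Yankees", "Cubs"])
# The longest entry has 3 tokens, so columns run toks_0 .. toks_3
# (None-terminated).
print(dict_df.columns.tolist())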
def test_concat_mixed_dtypes(self, data):
    # https://github.com/pandas-dev/pandas/issues/20762
    df1 = pd.DataFrame({'A': data[:3]})
    df2 = pd.DataFrame({"A": [1, 2, 3]})
    df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
    df4 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
    dfs = [df1, df2, df3, df4]

    # dataframes
    result = pd.concat(dfs)
    expected = pd.concat([x.astype(object) for x in dfs])
    self.assert_frame_equal(result, expected)

    # series
    result = pd.concat([x['A'] for x in dfs])
    expected = pd.concat([x['A'].astype(object) for x in dfs])
    self.assert_series_equal(result, expected)

    # simple test for just EA and one other
    result = pd.concat([df1, df2])
    expected = pd.concat([df1.astype('object'), df2.astype('object')])
    self.assert_frame_equal(result, expected)
def _compare_other(self, s, data, op_name, other):
    op = self.get_op_from_name(op_name)

    # array
    result = pd.Series(op(data, other))
    # hard to test the fill value, since we don't know what expected
    # is in general.
    # Rely on tests in `tests/sparse` to validate that.
    assert isinstance(result.dtype, SparseDtype)
    assert result.dtype.subtype == np.dtype('bool')

    with np.errstate(all='ignore'):
        expected = pd.Series(
            pd.SparseArray(op(np.asarray(data), np.asarray(other)),
                           fill_value=result.values.fill_value))
    tm.assert_series_equal(result, expected)

    # series
    s = pd.Series(data)
    result = op(s, other)
    tm.assert_series_equal(result, expected)
class TestABCClasses(object):
    tuples = [[1, 2, 2], ['red', 'blue', 'red']]
    multi_index = pd.MultiIndex.from_arrays(tuples, names=('number', 'color'))
    datetime_index = pd.to_datetime(['2000/1/1', '2010/1/1'])
    timedelta_index = pd.to_timedelta(np.arange(5), unit='s')
    period_index = pd.period_range('2000/1/1', '2010/1/1', freq='M')
    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
    df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
    sparse_series = pd.Series([1, 2, 3]).to_sparse()
    sparse_array = pd.SparseArray(np.random.randn(10))

    def test_abc_types(self):
        assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
        assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
        assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
        assert isinstance(self.multi_index, gt.ABCMultiIndex)
        assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
        assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
        assert isinstance(self.period_index, gt.ABCPeriodIndex)
        assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
        assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass)
        assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
        assert isinstance(self.df, gt.ABCDataFrame)
        with catch_warnings(record=True):
            assert isinstance(self.df.to_panel(), gt.ABCPanel)
        assert isinstance(self.sparse_series, gt.ABCSparseSeries)
        assert isinstance(self.sparse_array, gt.ABCSparseArray)
        assert isinstance(self.categorical, gt.ABCCategorical)
        assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)

        assert isinstance(pd.DateOffset(), gt.ABCDateOffset)
        assert isinstance(pd.Period('2012', freq='A-DEC').freq,
                          gt.ABCDateOffset)
        assert not isinstance(pd.Period('2012', freq='A-DEC'),
                              gt.ABCDateOffset)
def test_subclass_sparse_to_frame(self):
    s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx')
    res = s.to_frame()

    exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block',
                             fill_value=0)
    exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr},
                                       index=list('ab'),
                                       default_fill_value=0)
    tm.assert_sp_frame_equal(res, exp)

    # create from int dict
    res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]},
                                       index=list('ab'),
                                       default_fill_value=0)
    tm.assert_sp_frame_equal(res, exp)

    s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'), name='xxx')
    res = s.to_frame()
    exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]},
                                       index=list('ab'))
    tm.assert_sp_frame_equal(res, exp)
def test_loc(self):
    # needs to be overridden here to use different labels
    orig = self.orig
    sparse = self.sparse

    tm.assert_sp_series_equal(sparse.loc['A'], orig.loc['A'].to_sparse())
    tm.assert_sp_series_equal(sparse.loc['B'], orig.loc['B'].to_sparse())

    result = sparse.loc[[1, 3, 4]]
    exp = orig.loc[[1, 3, 4]].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # exceeds the bounds
    result = sparse.loc[[1, 3, 4, 5]]
    exp = orig.loc[[1, 3, 4, 5]].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # single element list (GH 15447)
    result = sparse.loc[['A']]
    exp = orig.loc[['A']].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # dense array
    result = sparse.loc[orig % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array (actually it coerces to a normal Series)
    result = sparse.loc[sparse % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array
    result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
    tm.assert_sp_series_equal(result, exp)
def _convert_arff_coo(features, columns, arff_data_data):
    # Decide which columns to keep: all of them if features is None,
    # otherwise only those named in the (de-duplicated) feature set.
    if features is None:
        data = [([], []) for _ in columns]
    else:
        fset = remove_dups_from_list(features)
        data = [([], []) if c in fset else None for c in columns]

    # Scatter the COO triples (value, row, column) into per-column
    # (indices, values) pairs; row indices must arrive in increasing order.
    for v, i, j in zip(*arff_data_data):
        d = data[j]
        if d is not None:
            indices, values = d
            if indices:
                assert indices[-1] < i
            indices.append(i)
            values.append(v)

    # The height of the result is one past the largest row index seen.
    max_i = -1
    for d in data:
        if d is not None and len(d[0]) > 0:
            max_i = max(max_i, d[0][-1])
    height = max_i + 1

    # Build one sparse Series per kept column; dropped columns become None.
    series = []
    for d in data:
        if d is None:
            s = None
        else:
            keys, values = d
            sa = pd.SparseArray(values,
                                sparse_index=pd._libs.sparse.IntIndex(
                                    height, keys),
                                fill_value=0)
            s = pd.Series(sa.values)
        series.append(s)

    return series
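# A hedged example: with features=None, arff_data_data is treated as the
# (values, row_indices, col_indices) triple of a COO-encoded ARFF payload.
# The values below are assumptions chosen for illustration.
values, rows, cols = [1.0, 2.0, 3.0], [0, 2, 2], [0, 0, 1]
cols_out = _convert_arff_coo(None, ["f0", "f1"], (values, rows, cols))
print(cols_out[0])  # sparse Series for column "f0", fill_value=0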
def test_loc_index(self):
    orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE"))
    sparse = orig.to_sparse()

    assert sparse.loc["A"] == 1
    assert np.isnan(sparse.loc["B"])

    result = sparse.loc[["A", "C", "D"]]
    exp = orig.loc[["A", "C", "D"]].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # dense array
    result = sparse.loc[orig % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array (actually it coerces to a normal Series)
    result = sparse.loc[sparse % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array
    result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
    tm.assert_sp_series_equal(result, exp)
def test_loc_index(self):
    orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE'))
    sparse = orig.to_sparse()

    self.assertEqual(sparse.loc['A'], 1)
    self.assertTrue(np.isnan(sparse.loc['B']))

    result = sparse.loc[['A', 'C', 'D']]
    exp = orig.loc[['A', 'C', 'D']].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # dense array
    result = sparse.loc[orig % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array (actually it coerces to a normal Series)
    result = sparse.loc[sparse % 2 == 1]
    exp = orig.loc[orig % 2 == 1].to_sparse()
    tm.assert_sp_series_equal(result, exp)

    # sparse array
    result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
    tm.assert_sp_series_equal(result, exp)
def test_registry_find(dtype, expected):
    assert registry.find(dtype) == expected


@pytest.mark.parametrize(
    "dtype, expected",
    [
        (str, False),
        (int, False),
        (bool, True),
        (np.bool, True),
        (np.array(["a", "b"]), False),
        (pd.Series([1, 2]), False),
        (np.array([True, False]), True),
        (pd.Series([True, False]), True),
        (pd.SparseArray([True, False]), True),
        (SparseDtype(bool), True),
    ],
)
def test_is_bool_dtype(dtype, expected):
    result = is_bool_dtype(dtype)
    assert result is expected


def test_is_bool_dtype_sparse():
    result = is_bool_dtype(pd.Series(pd.SparseArray([True, False])))
    assert result is True


@pytest.mark.parametrize(
    "check",
def test_is_scipy_sparse():
    from scipy.sparse import bsr_matrix
    assert com.is_scipy_sparse(bsr_matrix([1, 2, 3]))

    assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3]))
    assert not com.is_scipy_sparse(pd.SparseSeries([1, 2, 3]))
def test_boolean_slice_empty(self):
    arr = pd.SparseArray([0, 1, 2])
    res = arr[[False, False, False]]
    assert res.dtype == arr.dtype
def test_unique_na_fill(arr, fill_value):
    a = pd.SparseArray(arr, fill_value=fill_value).unique()
    b = pd.Series(arr).unique()
    assert isinstance(a, SparseArray)
    a = np.asarray(a)
    tm.assert_numpy_array_equal(a, b)
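# A concrete hedged instance of the equivalence the test above asserts
# (the array values are assumptions): SparseArray.unique() should match
# Series.unique() on the dense data, including the NaN entry.
arr = np.array([0.0, 1.0, np.nan, 1.0])
print(np.asarray(pd.SparseArray(arr, fill_value=0.0).unique()))
print(pd.Series(arr).unique())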
def test_asarray_datetime64(self):
    s = pd.SparseArray(pd.to_datetime(['2012', None, None, '2013']))
    np.asarray(s)
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
    (pd.TimedeltaIndex(['1H', '2H']), None,
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
    # Category
    (['a', 'b'], 'category', pd.Categorical(['a', 'b'])),
    (['a', 'b'], pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(['a', 'b'], ordered=True)),
    # Interval
    ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval',
     pd.IntervalArray.from_tuples([(1, 2), (3, 4)])),
    # Sparse
    ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')),
    # IntegerNA
    ([1, None], 'Int16', integer_array([1, None], dtype='Int16')),
    (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
    # Index
    (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
    # Series[EA] returns the EA
    (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
     None,
     pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
    # "3rd party" EAs work
    ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])),
def lookup_sv_counts(spec_sv_tup):
    # Unpack (specimen column position, ordered SV row indices, seqtable);
    # return that specimen's counts in the requested SV order.
    spec_idx, ordered_sv_idx, seqtab_T = spec_sv_tup
    return pd.SparseArray(
        [seqtab_T.iat[sv_idx, spec_idx] for sv_idx in ordered_sv_idx])
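# Hedged illustration of the tuple contract lookup_sv_counts expects; the tiny
# seqtable below (specimens as columns, SVs as rows) is an assumption.
demo_seqtab_T = pd.DataFrame({
    "spec1": pd.SparseArray([3, 0], fill_value=0),
    "spec2": pd.SparseArray([0, 5], fill_value=0),
})
# Counts for the first specimen, with the SV rows reordered as [1, 0].
print(lookup_sv_counts((0, [1, 0], demo_seqtab_T)))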
def test_asarray_datetime64(self):
    s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
    np.asarray(s)
def test_loc(self):
    orig = pd.DataFrame([[1, np.nan, np.nan],
                         [2, 3, np.nan],
                         [np.nan, np.nan, 4]],
                        columns=list('xyz'))
    sparse = orig.to_sparse()

    assert sparse.loc[0, 'x'] == 1
    assert np.isnan(sparse.loc[1, 'z'])
    assert sparse.loc[2, 'z'] == 4

    # have to specify `kind='integer'`, since we construct a
    # new SparseArray here, and the default sparse type is
    # integer there, but block in SparseSeries
    tm.assert_sp_series_equal(sparse.loc[0],
                              orig.loc[0].to_sparse(kind='integer'))
    tm.assert_sp_series_equal(sparse.loc[1],
                              orig.loc[1].to_sparse(kind='integer'))
    tm.assert_sp_series_equal(sparse.loc[2, :],
                              orig.loc[2, :].to_sparse(kind='integer'))
    tm.assert_sp_series_equal(sparse.loc[2, :],
                              orig.loc[2, :].to_sparse(kind='integer'))
    tm.assert_sp_series_equal(sparse.loc[:, 'y'],
                              orig.loc[:, 'y'].to_sparse())
    tm.assert_sp_series_equal(sparse.loc[:, 'y'],
                              orig.loc[:, 'y'].to_sparse())

    result = sparse.loc[[1, 2]]
    exp = orig.loc[[1, 2]].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    result = sparse.loc[[1, 2], :]
    exp = orig.loc[[1, 2], :].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    result = sparse.loc[:, ['x', 'z']]
    exp = orig.loc[:, ['x', 'z']].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    result = sparse.loc[[0, 2], ['x', 'z']]
    exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    # exceeds the bounds
    result = sparse.reindex([1, 3, 4, 5])
    exp = orig.reindex([1, 3, 4, 5]).to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    # dense array
    result = sparse.loc[orig.x % 2 == 1]
    exp = orig.loc[orig.x % 2 == 1].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    # sparse array (actually it coerces to a normal Series)
    result = sparse.loc[sparse.x % 2 == 1]
    exp = orig.loc[orig.x % 2 == 1].to_sparse()
    tm.assert_sp_frame_equal(result, exp)

    # sparse array
    result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
    tm.assert_sp_frame_equal(result, exp)
def main():
    args_parser = argparse.ArgumentParser(
        description="""A small utility to convert dada2 style seqtables
        to a MOTHUR style sharetable and/or pplacer-style map and weights
        files.
        """)
    args_parser.add_argument('--seqtable', '-s',
                             help="Sequence table from dada2, in CSV format",
                             required=True,
                             type=argparse.FileType('r'))
    args_parser.add_argument(
        '--fasta_out_sequences', '-f',
        help="Write sequence variants to this file, in FASTA format",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--map', '-m',
        help="Write pplacer-style mapping of sv to specimen",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--weights', '-w',
        help="Write pplacer-style weights of sv by specimen",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--long', '-L',
        help="Write out specimen, sv_id, count in long format",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--sharetable', '-t',
        help="Write mothur-style sharetable to this location",
        type=argparse.FileType('w'))
    args_parser.add_argument(
        '--cpus', '-C',
        help="Number of threads to use. Default is number of vCPU available",
        type=int,
        default=None)

    args = args_parser.parse_args()

    # Check to see if we've been tasked with anything.
    # If not, we have nothing to do and should exit.
    if not (args.fasta_out_sequences or args.map or args.weights
            or args.sharetable or args.long):
        sys.exit("Nothing to do")

    # Just convert our handles over to something nicer
    if args.fasta_out_sequences:
        out_sv_seqs_h = args.fasta_out_sequences
    else:
        out_sv_seqs_h = None
    if args.map:
        out_map_h = args.map
        map_writer = csv.writer(out_map_h)
    else:
        map_writer = None
    if args.weights:
        out_weights_h = args.weights
        weights_writer = csv.writer(out_weights_h)
    else:
        weights_writer = None
    if args.long:
        long_writer = csv.writer(args.long)
        # Header
        long_writer.writerow(['specimen', 'sv', 'count'])
    else:
        long_writer = None
    if args.sharetable:
        sharetable_fn = args.sharetable
    else:
        sharetable_fn = None

    logging.info("Loading DADA2 seqtable")
    # Load the sequence table.
    # Reduce memory use by streaming in and using sparse structures.
    seqtab_T = pd.DataFrame()
    seqtab_reader = csv.reader(args.seqtable)
    # Get the header, which are the SV sequences themselves
    sv_header = next(seqtab_reader)[1:]
    for r in seqtab_reader:
        specimen = r[0]
        counts = [int(c) for c in r[1:]]
        seqtab_T[specimen] = pd.SparseArray(counts, dtype=int, fill_value=0)
    logging.info("DADA2 Seqtable loaded")

    # Order the SV by their mean relative abundance
    logging.info("Ordering SV by mean relative abundance")
    ordered_sv_idx = list(
        (seqtab_T / seqtab_T.sum(axis=0)).mean(axis=1)
        .sort_values(ascending=False).index)

    # Generate sv labels for each sequence variant,
    # and generate a dictionary to map sv_id to sequence-variant
    logging.info("Generating SV names")
    seq_idx_to_sv_num = {
        idx: 'sv-%d' % (i + 1)
        for i, idx in enumerate(ordered_sv_idx)
    }

    # Transpose, reorder, and rename into a new seqtab
    logging.info("Generating new seqtable")
    convert_pool = Pool(args.cpus)
    num_specimens = len(seqtab_T.columns)
    seqtab_reorder = pd.DataFrame(
        convert_pool.imap(
            lookup_sv_counts,
            zip(range(num_specimens),
                [ordered_sv_idx] * num_specimens,
                [seqtab_T] * num_specimens)),
        columns=[seq_idx_to_sv_num[sv_idx] for sv_idx in ordered_sv_idx],
        index=seqtab_T.columns)
    logging.info("Reordered seqtable done")

    # Annoyingly, we need to pick a representative actual sequence
    # from each sv to be its champion for guppy.
    # To do so, we will go through each column, find the max count for that
    # sv, and use that specimen as the champion.
    logging.info("Finding maximum specimen for each SV")
    max_spec_for_sv = {
        sv_id: spec
        for sv_id, spec in seqtab_reorder.apply(lambda c: c.idxmax()).items()
    }

    if out_sv_seqs_h is not None:
        # Write out the sequences in fasta format,
        # using the sv-ids generated above as an ID
        logging.info("Writing out SV to FASTA")
        for sv_idx in ordered_sv_idx:
            out_sv_seqs_h.write(">%s:%s\n%s\n" % (
                seq_idx_to_sv_num[sv_idx],
                max_spec_for_sv[seq_idx_to_sv_num[sv_idx]],
                sv_header[sv_idx]))

    # Now write the mapping and weights files.
    # Both are headerless CSV format files:
    #   map:    sequence_id (sv_id:specimen), specimen
    #   weight: sequence_id (sv_id here),
    #           specimen_sequence_id (sv_id:specimen here), count
    # This is a bit of a clunky structure (relating to some historic cruft).
    if map_writer or weights_writer or long_writer:
        logging.info("Writing out long, map, and/or weights")
        for spec, row in seqtab_reorder.iterrows():
            row_nonzero = row[row > 0]
            for sv_id, count in row_nonzero.items():
                if map_writer is not None:
                    map_writer.writerow([str(sv_id) + ":" + str(spec), spec])
                if weights_writer is not None:
                    weights_writer.writerow([
                        sv_id + ":" + max_spec_for_sv[sv_id],
                        str(sv_id) + ":" + str(spec),
                        count
                    ])
                if long_writer is not None:
                    long_writer.writerow(
                        [spec, sv_id + ":" + max_spec_for_sv[sv_id], count])

    if sharetable_fn is not None:
        sharetable_labels = pd.DataFrame()
        sharetable_labels['label'] = list(seqtab_reorder.index)
        sharetable_labels['group'] = "dada2"
        sharetable_labels['numsvs'] = len(seqtab_reorder.columns)
        pd.merge(sharetable_labels,
                 seqtab_reorder,
                 left_on='label',
                 right_index=True).to_csv(sharetable_fn, index=False, sep='\t')

    # Cleanup.
    if out_sv_seqs_h:
        out_sv_seqs_h.close()
    if map_writer:
        out_map_h.close()
    if weights_writer:
        out_weights_h.close()
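# Typical invocation of the converter above (hypothetical file names;
# the script name is assumed, not part of the original source):
#
#   python seqtab_convert.py --seqtable seqtab.csv \
#       --fasta_out_sequences svs.fasta --map map.csv --weights weights.csv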
    ),
    # Category
    (["a", "b"], "category", pd.Categorical(["a", "b"])),
    (
        ["a", "b"],
        pd.CategoricalDtype(None, ordered=True),
        pd.Categorical(["a", "b"], ordered=True),
    ),
    # Interval
    (
        [pd.Interval(1, 2), pd.Interval(3, 4)],
        "interval",
        pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]),
    ),
    # Sparse
    ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")),
    # IntegerNA
    ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
    (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
    # Index
    (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
    # Series[EA] returns the EA
    (
        pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
        None,
        pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
    ),
    # "3rd party" EAs work
    ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal(
def test_ndarray_values(array, expected):
    l_values = pd.Series(array)._ndarray_values
    r_values = pd.Index(array)._ndarray_values
    tm.assert_numpy_array_equal(l_values, r_values)
    tm.assert_numpy_array_equal(l_values, expected)


@pytest.mark.parametrize(
    "array, attr",
    [
        (np.array([1, 2], dtype=np.int64), None),
        (pd.Categorical(['a', 'b']), '_codes'),
        (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'),
        (pd.core.arrays.integer_array([0, np.nan]), '_data'),
        (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'),
        (pd.SparseArray([0, 1]), '_sparse_values'),
        # TODO: DatetimeArray(add)
    ])
@pytest.mark.parametrize('box', [pd.Series, pd.Index])
def test_array(array, attr, box):
    if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index:
        pytest.skip("No index type for {}".format(array.dtype))
    result = box(array, copy=False).array
    if attr:
        array = getattr(array, attr)
        result = getattr(result, attr)
    assert result is array