def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected) # From a mixed type dataframe df["A"] = df["A"].astype(np.int16) df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) expected["A"] = expected["A"].astype(np.int16) expected["B"] = expected["B"].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected)
def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) df.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) # From a mixed type dataframe df['A'] = df['A'].astype(np.int16) df['B'] = df['B'].astype(np.float64) result = df.unstack(fill_value=-1) expected['A'] = expected['A'].astype(np.int16) expected['B'] = expected['B'].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list('xyz'), dtype=np.float) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected)
class Unstack(object):
    """ASV benchmark: DataFrame.unstack on integer and Categorical frames."""

    params = ['int', 'category']

    def setup(self, dtype):
        n_levels = 100
        n_cols = 1000
        level_values = np.arange(n_levels)
        index = MultiIndex.from_product([level_values] * 2)
        columns = np.arange(n_cols)

        if dtype == 'int':
            values = np.arange(n_levels * n_levels * n_cols)
            values = values.reshape(n_levels * n_levels, n_cols)
        else:
            # the category branch is ~20x slower than int. So we
            # cut down the size a bit. Now it's only ~3x slower.
            n_cols = 50
            columns = columns[:n_cols]
            letter_idx = np.random.randint(0, 52, size=(n_levels * n_levels, n_cols))
            letters = np.take(list(string.ascii_letters), letter_idx)
            values = [pd.Categorical(col) for col in letters.T]

        self.df = DataFrame(values, index, columns)
        self.df2 = self.df.iloc[:-1]  # ragged variant: drop the last row

    def time_full_product(self, dtype):
        self.df.unstack()

    def time_without_last_row(self, dtype):
        self.df2.unstack()
def test_unstack_non_unique_index_names(self):
    """Duplicate level names must raise for both unstack and stack."""
    index = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
    frame = DataFrame([1, 2], index=index)

    with tm.assertRaises(ValueError):
        frame.unstack("c1")

    with tm.assertRaises(ValueError):
        frame.T.stack("c1")
def test_unstack_non_unique_index_names(self):
    """Referencing an ambiguous (duplicated) level name is a ValueError."""
    index = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                   names=['c1', 'c1'])
    frame = DataFrame([1, 2], index=index)

    with pytest.raises(ValueError):
        frame.unstack('c1')

    with pytest.raises(ValueError):
        frame.T.stack('c1')
def test_stack_unstack(self): stacked = self.frame.stack() stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() assert_frame_equal(unstacked, self.frame) assert_frame_equal(unstacked_df['bar'], self.frame) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) assert_frame_equal(unstacked_cols.T, self.frame) assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)
class SparseIndex(object):
    """ASV benchmark: unstack on a frame indexed by five integer levels,
    whose full level product is much larger than the row count (sparse)."""

    def setup(self):
        rows = 1000
        rng = np.random
        data = {'A': rng.randint(50, size=rows),
                'B': rng.randint(50, size=rows),
                'C': rng.randint(-10, 10, size=rows),
                'D': rng.randint(-10, 10, size=rows),
                'E': rng.randint(10, size=rows),
                'F': rng.randn(rows)}
        self.df = DataFrame(data).set_index(['A', 'B', 'C', 'D', 'E'])

    def time_unstack(self):
        self.df.unstack()
def test_stack_unstack(self): stacked = self.frame.stack() stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() assert_frame_equal(unstacked, self.frame) assert_frame_equal(unstacked_df["bar"], self.frame) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) assert_frame_equal(unstacked_cols.T, self.frame) assert_frame_equal(unstacked_cols_df["bar"].T, self.frame)
class SimpleReshape(object):
    """ASV benchmark: paired stack/unstack on a two-level random frame."""

    def setup(self):
        outer = np.arange(100).repeat(100)
        inner = np.roll(np.tile(np.arange(100), 100), 25)
        index = MultiIndex.from_arrays([outer, inner])
        self.df = DataFrame(np.random.randn(10000, 4), index=index)
        # pre-unstacked copy for the stack timing
        self.udf = self.df.unstack(1)

    def time_stack(self):
        self.udf.stack()

    def time_unstack(self):
        self.df.unstack(1)
def test_boxplot_legacy(self):
    """Grouped boxplots: axes count and layout for several grouping schemes."""

    def _assert_single_axes(gb):
        # subplots=False always collapses to one axes object
        ax = _check_plot_works(gb.boxplot, subplots=False, return_type='axes')
        self._check_axes_shape(ax, axes_num=1, layout=(1, 1))

    by_gender = self.hist_df.groupby(by='gender')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        axes = _check_plot_works(by_gender.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2))
    _assert_single_axes(by_gender)

    pairs = lzip(string.ascii_letters[:10], range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(pairs))

    by_level = df.groupby(level=1)
    axes = _check_plot_works(by_level.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3))
    _assert_single_axes(by_level)

    by_columns = df.unstack(level=1).groupby(level=0, axis=1)
    axes = _check_plot_works(by_columns.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2))
    _assert_single_axes(by_columns)
def test_stack_unstack(self): df = self.frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) stacked = df.stack() stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() assert_frame_equal(unstacked, df) assert_frame_equal(unstacked_df['bar'], df) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) assert_frame_equal(unstacked_cols.T, df) assert_frame_equal(unstacked_cols_df['bar'].T, df)
def test_unstack_bool(self): df = DataFrame([False, False], index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), columns=["col"]) rs = df.unstack() xp = DataFrame( np.array([[False, np.nan], [np.nan, False]], dtype=object), index=["a", "b"], columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), ) assert_frame_equal(rs, xp)
def test_boxplot_legacy3(self):
    """Column-wise grouped boxplot: layout checks with and without subplots."""
    pairs = zip(string.ascii_letters[:10], range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(pairs))
    by_columns = df.unstack(level=1).groupby(level=0, axis=1)

    with tm.assert_produces_warning(UserWarning):
        axes = _check_plot_works(by_columns.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2))

    axes = _check_plot_works(by_columns.boxplot, subplots=False,
                             return_type='axes')
    self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
def test_unstack_bool(self): df = DataFrame([False, False], index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]), columns=['col']) rs = df.unstack() xp = DataFrame(np.array([[False, np.nan], [np.nan, False]], dtype=object), index=['a', 'b'], columns=MultiIndex.from_arrays([['col', 'col'], ['c', 'l']])) assert_frame_equal(rs, xp)
class Unstack(object):
    """ASV benchmark: wide unstack of a two-level integer frame."""

    goal_time = 0.2

    def setup(self):
        size = 100
        width = 1000
        level_values = np.arange(size)
        index = MultiIndex.from_product([level_values] * 2)
        columns = np.arange(width)
        values = np.arange(size * size * width).reshape(size * size, width)
        self.df = DataFrame(values, index, columns)
        self.df2 = self.df.iloc[:-1]  # same frame minus the final row

    def time_full_product(self):
        self.df.unstack()

    def time_without_last_row(self):
        self.df2.unstack()
def test_boxplot_legacy3(self):
    """Axis-1 grouped boxplot warns and lays out 3 axes on a 2x2 grid."""
    index_pairs = zip(string.ascii_letters[:10], range(10))
    frame = DataFrame(np.random.rand(10, 3),
                      index=MultiIndex.from_tuples(index_pairs))
    grouped = frame.unstack(level=1).groupby(level=0, axis=1)

    with tm.assert_produces_warning(UserWarning):
        axes = _check_plot_works(grouped.boxplot, return_type="axes")
    self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2))

    axes = _check_plot_works(grouped.boxplot, subplots=False,
                             return_type="axes")
    self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
def read_gmt(filepath, gene_sets=(), drop_description=True, save_clean=False,
             collapse=False):
    """
    Read GMT.
    :param filepath: str; filepath to a .gmt compress
    :param gene_sets: iterable: list of gene set names to keep
    :param drop_description: bool; drop Description column (2nd column) or not
    :param save_clean: bool; Save as .gmt (cleaned version) or not
    :param collapse: bool; collapse into a list of unique genes or not
    :return: DataFrame or list; (n_gene_sets, size of the largest gene set) or
        (n_unique genes)
    """

    # Parse: first two fields are name and description, the rest are genes
    rows = []
    with open(filepath) as f:
        for line in f.readlines():
            line_split = line.strip().split('\t')
            # Sort genes and add as a GMT gene set (row)
            rows.append(line_split[:2] +
                        sorted([g for g in line_split[2:] if g]))

    # Make a DataFrame
    gmt = DataFrame(rows)

    # Set index
    gmt.set_index(0, inplace=True)
    gmt.index.name = 'Gene Set'
    gmt.sort_index(inplace=True)
    gmt.columns = ['Description'] + \
        ['Gene {}'.format(i) for i in range(1, gmt.shape[1])]

    if save_clean:  # Save the cleaned version (before any columns are dropped)
        gmt.to_csv(filepath, sep='\t', header=False)

    if drop_description or collapse:
        gmt.drop('Description', axis=1, inplace=True)

    # Keep specific gene sets
    if isinstance(gene_sets, str):
        gene_sets = [gene_sets]
    if any(gene_sets):
        gene_sets = sorted(set(gmt.index) & set(gene_sets))
        # .ix was removed from pandas; .loc is the label-based replacement
        gmt = gmt.loc[gene_sets, :]
        gmt.dropna(axis=1, how='all', inplace=True)

    if collapse:
        # Flatten to the sorted set of unique genes across the kept sets
        return sorted(set(gmt.unstack().dropna()))
    else:
        return gmt
def test_unstack_bool(self): df = DataFrame( [False, False], index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), columns=["col"], ) rs = df.unstack() xp = DataFrame( np.array([[False, np.nan], [np.nan, False]], dtype=object), index=["a", "b"], columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), ) tm.assert_frame_equal(rs, xp)
def hierachicalIndexingDataFrame():
    """Demo: hierarchical (MultiIndex) rows and columns on a DataFrame.

    Two index levels on each axis let higher-dimensional data live in a
    two-dimensional layout.
    """
    row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
    col_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
    df = DataFrame(np.arange(12).reshape(4, 3),
                   index=row_labels, columns=col_labels)
    print(df)
    print(df['Ohio'])       # select a top-level column group
    print(df.unstack())     # pivot the inner row level into columns
    df.index.names = ['key1', 'key2']
    df.columns.names = ['state', 'color']
    print(df)
    print(df.swaplevel('key1', 'key2'))
def test_unstack_to_series(self): # check reversibility data = self.frame.unstack() self.assertTrue(isinstance(data, Series)) undo = data.unstack().T assert_frame_equal(undo, self.frame) # check NA handling data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) data.index = Index(["a", "b", "c"]) result = data.unstack() midx = MultiIndex(levels=[["x", "y"], ["a", "b", "c"]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) # check composability of unstack old_data = data.copy() for _ in range(4): data = data.unstack() assert_frame_equal(old_data, data)
def test_unstack_mixed_type_name_in_multiindex(self, unstack_idx, expected_values, expected_index, expected_columns): # GH 19966 idx = pd.MultiIndex.from_product([["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]) df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx) result = df.unstack(unstack_idx) expected = DataFrame(expected_values, columns=expected_columns, index=expected_index) tm.assert_frame_equal(result, expected)
def slide_8():
    """Demo stack/unstack on frames and concatenated series (prints results).

    Fixed: the original used Python 2 ``print`` statements, which are a
    SyntaxError on Python 3; converted to ``print()`` calls.
    """
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
    print(data)
    result = data.stack()
    print('***stack()***')
    print(result)
    print('***unstack()***')
    print(result.unstack())
    print('***unstack(0)***')
    print(result.unstack(0))
    print("***unstack('state')***")
    print(result.unstack('state'))

    # Concatenating series with keys builds a two-level index
    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    data2 = pd.concat([s1, s2], keys=['one', 'two'])
    print('***unstack***')
    print(data2.unstack())
    print('***unstack->stack***')
    print(data2.unstack().stack())
    print('***unstack->stack(dropna)***')
    print(data2.unstack().stack(dropna=False))

    df = DataFrame({'left': result, 'right': result + 5},
                   columns=pd.Index(['left', 'right'], name='side'))
    print('df')
    print(df)
    print("unstack('state')")
    print(df.unstack('state'))
    print("unstack('state').stack('side')")
    print(df.unstack('state').stack('side'))
def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results.
    Includes dimension feature weights and explained variance.
    Visualizes the PCA results.

    :param good_data: DataFrame. all dataset log transformed with 6 columns
    :param pca: Sklearn Object. a PCA decomposition object already fitted
    '''

    # Dimension indexing (the original duplicated the assignment:
    # ``dimensions = dimensions = [...]``)
    dimensions = ['Dimension {}'.format(i)
                  for i in range(1, len(pca.components_) + 1)]

    # PCA components. Builtin round() does not accept ndarrays on Python 3,
    # so round through the DataFrame method instead.
    components = DataFrame(pca.components_,
                           columns=list(good_data.keys())).round(4)
    components.index = dimensions

    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = DataFrame(ratios,
                                columns=['Explained Variance']).round(4)
    variance_ratios.index = dimensions

    # reshape the data to be plotted
    df_aux = components.unstack().reset_index()
    df_aux.columns = ['Feature', 'Dimension', 'Variance']

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot the feature weights as a function of the components
    barplot(x='Dimension', y='Variance', hue='Feature', data=df_aux, ax=ax)
    ax.set_ylabel('Feature Weights')
    ax.set_xlabel('')
    ax.set_xticklabels(dimensions, rotation=0)

    # Display the explained variance ratios
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05,
                'Explained Variance\n %.4f' % (ev))

    # Return a concatenated DataFrame
    return concat([variance_ratios, components], axis=1)
def process_data(result_dic, dic_ne, open_file=False):
    """Flatten a nested result dict into a one-row-per-model DataFrame.

    :param result_dic: nested dict accepted by the DataFrame constructor
        (presumably {model: {metric: value}}; confirm against caller)
    :param dic_ne: mapping whose values() are the column names to keep,
        in order
    :param open_file: when True, also dump the frame to 'output.txt'
        (tab-separated) and open it with the default application
        (NOTE: os.startfile is Windows-only)
    :return: the reshaped DataFrame
    """
    df = DataFrame(result_dic)
    # The double unstack pivots the frame so each model becomes one row; the
    # former index surfaces as an 'index' column, renamed to 'model_id'.
    df = df.unstack().unstack(level=-1).reset_index().rename(
        columns={'index': 'model_id'})
    # Keep only model_id plus the columns named by dic_ne, in that order.
    columns = ['model_id']
    columns += list(dic_ne.values())
    df = df[columns]
    if open_file:
        try:
            filename = 'output.txt'
            df.to_csv(filename, sep='\t', index=False)
            os.startfile(filename)
        except Exception as e:
            # best-effort: opening the file for viewing is optional
            print(e)
    return df
def test_unstack_to_series(self): # check reversibility data = self.frame.unstack() assert isinstance(data, Series) undo = data.unstack().T assert_frame_equal(undo, self.frame) # check NA handling data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) data.index = Index(['a', 'b', 'c']) result = data.unstack() midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) # check composability of unstack old_data = data.copy() for _ in range(4): data = data.unstack() assert_frame_equal(old_data, data)
def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] ) tm.assert_frame_equal(result, expected) # From a mixed type dataframe df["A"] = df["A"].astype(np.int16) df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) expected["A"] = expected["A"].astype(np.int16) expected["B"] = expected["B"].astype(np.float64) tm.assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] ) tm.assert_frame_equal(result, expected)
def test_unstack_to_series(self): # check reversibility data = self.frame.unstack() assert isinstance(data, Series) undo = data.unstack().T assert_frame_equal(undo, self.frame) # check NA handling data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) data.index = Index(['a', 'b', 'c']) result = data.unstack() midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) # check composability of unstack old_data = data.copy() for _ in range(4): data = data.unstack() assert_frame_equal(old_data, data)
def test_boxplot(self):
    """Boxplot smoke test for by-column, by-level, and axis-1 groupings."""
    df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"])
    df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
    grouped = df.groupby(by="X")
    _check_plot_works(grouped.boxplot)
    _check_plot_works(grouped.boxplot, subplots=False)

    pairs = lzip(string.ascii_letters[:10], range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(pairs))
    # each grouping must plot both as subplots and as a single figure
    for gb in (df.groupby(level=1),
               df.unstack(level=1).groupby(level=0, axis=1)):
        _check_plot_works(gb.boxplot)
        _check_plot_works(gb.boxplot, subplots=False)
def pivot_data_frame(self, data_frame: pd.DataFrame,
                     pivot_dimensions: List[str],
                     transpose: bool) -> Tuple[pd.DataFrame, bool, bool]:
    """
    Pivot and transpose the data frame. Dimensions including in the `pivot`
    arg will be unshifted to columns. If `transpose` is True the data frame
    will be transposed. If there is only index level in the data frame
    (ie. one dimension), and that dimension is pivoted, then the data frame
    will just be transposed. If there is a single metric in the data frame
    and at least one dimension pivoted, the metrics column level will be
    dropped for simplicity.

    :param data_frame: The result set data frame
    :param pivot_dimensions: A list of index aliases for `data_frame` of
        levels to shift
    :param transpose: A boolean true or false whether to transpose the data
        frame.
    :return: Tuple(The shifted/transposed data frame, is_pivoted,
        is_transposed)
    """
    is_pivoted = False
    is_transposed = False

    # Nothing to do: return the frame sorted but otherwise untouched.
    if not self._should_data_frame_be_transformed(
            data_frame, pivot_dimensions, transpose):
        return self.sort_data_frame(data_frame), is_pivoted, is_transposed

    # NOTE: Don't pivot a single dimension data frame. This turns the data frame into a series and pivots the
    # metrics anyway. Instead, transpose the data frame.
    should_transpose_instead_of_pivot = len(pivot_dimensions) == len(
        data_frame.index.names)

    if pivot_dimensions and not should_transpose_instead_of_pivot:
        # Shift the requested index levels into the column axis.
        data_frame = data_frame.unstack(level=pivot_dimensions)
        is_pivoted = True

    if transpose or should_transpose_instead_of_pivot:
        data_frame = data_frame.transpose()
        is_transposed = True

    # If there are more than one column levels and the last level is a single metric, drop the level
    if isinstance(data_frame.columns, pd.MultiIndex) and 1 == len(
            data_frame.columns.levels[0]):
        data_frame.name = data_frame.columns.levels[0][
            0]  # capture the name of the metrics column
        data_frame.columns = data_frame.columns.droplevel(
            0)  # drop the metrics level

    return self.sort_data_frame(data_frame), is_pivoted, is_transposed
def test_boxplot(self):
    """Boxplots render for column, level, and axis-1 groupings."""

    def _smoke(gb):
        # each grouping must plot both as subplots and as one figure
        _check_plot_works(gb.boxplot)
        _check_plot_works(gb.boxplot, subplots=False)

    df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
    df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
    _smoke(df.groupby(by='X'))

    tuples = zip(list(string.ascii_letters[:10]), range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(tuples))
    _smoke(df.groupby(level=1))
    _smoke(df.unstack(level=1).groupby(level=0, axis=1))
def test_boxplot(self):
    """Grouped boxplots work with and without shared subplots."""
    frame = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
    frame['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
    by_col = frame.groupby(by='X')
    _check_plot_works(by_col.boxplot)
    _check_plot_works(by_col.boxplot, subplots=False)

    tuples = list(zip(list(string.ascii_letters[:10]), list(range(10))))
    frame = DataFrame(np.random.rand(10, 3),
                      index=MultiIndex.from_tuples(tuples))

    by_level = frame.groupby(level=1)
    _check_plot_works(by_level.boxplot)
    _check_plot_works(by_level.boxplot, subplots=False)

    by_columns = frame.unstack(level=1).groupby(level=0, axis=1)
    _check_plot_works(by_columns.boxplot)
    _check_plot_works(by_columns.boxplot, subplots=False)
def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) df.columns.name = "foo" expected = DataFrame( [[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]), ) expected.index.name = "bar" result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) tm.assert_frame_equal(result, expected)
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = data.unstack(fill_value=-1) expected = DataFrame( {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16 ) tm.assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame( {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float ) tm.assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame( {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]} ).set_index(["x", "y", "z"]) unstacked = df.unstack(["x", "y"], fill_value=0) key = ("w", "b", "j") expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) tm.assert_series_equal(result, expected) stacked = unstacked.stack(["x", "y"]) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] tm.assert_frame_equal(result, df) # From a series s = df["w"] result = s.unstack(["x", "y"], fill_value=0) expected = unstacked["w"] tm.assert_frame_equal(result, expected)
def test_unstack_mixed_extension_types(self, level): index = pd.MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 1)], names=["a", "b"]) df = DataFrame( { "A": pd.core.arrays.integer_array([0, 1, None]), "B": pd.Categorical(["a", "a", "b"]), }, index=index, ) result = df.unstack(level=level) expected = df.astype(object).unstack(level=level) expected_dtypes = Series([df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns) tm.assert_series_equal(result.dtypes, expected_dtypes) tm.assert_frame_equal(result.astype(object), expected)
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack(fill_value=-1) expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, index=['x', 'y', 'z'], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame({'x': ['a', 'a', 'b'], 'y': ['j', 'k', 'j'], 'z': [0, 1, 2], 'w': [0, 1, 2]}).set_index(['x', 'y', 'z']) unstacked = df.unstack(['x', 'y'], fill_value=0) key = ('w', 'b', 'j') expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) assert_series_equal(result, expected) stacked = unstacked.stack(['x', 'y']) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] assert_frame_equal(result, df) # From a series s = df['w'] result = s.unstack(['x', 'y'], fill_value=0) expected = unstacked['w'] assert_frame_equal(result, expected)
def test_info_memory_usage_bug_on_multiindex():
    # GH 14308
    # memory usage introspection should not materialize .values
    def total_usage(frame):
        return frame.memory_usage(deep=True).sum()

    n_dates = 100
    n_ids = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=n_dates)],
        names=["id", "date"],
    )
    df = DataFrame({"value": np.random.randn(n_dates * n_ids)}, index=index)
    unstacked = df.unstack("id")

    assert df.values.nbytes == unstacked.values.nbytes
    assert total_usage(df) > total_usage(unstacked)

    # high upper bound
    assert total_usage(unstacked) - total_usage(df) < 2000
def test_unstack_long_index(self): # PH 32624: Error when using a lot of indices to unstack. # The error occurred only, if a lot of indices are used. df = DataFrame( [[1]], columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]), index=pd.MultiIndex.from_tuples( [[0, 0, 1, 0, 0, 0, 1]], names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"], ), ) result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"]) expected = DataFrame( [[1]], columns=pd.MultiIndex.from_tuples( [[0, 0, 1, 0, 0, 0, 1]], names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"], ), index=pd.Index([0], name="i1"), ) tm.assert_frame_equal(result, expected)
def unstack_time_series(time_series: pd.DataFrame) -> pd.DataFrame: """ Given a route, stop, or feed time series of the form output by the functions, :func:`compute_stop_time_series`, :func:`compute_route_time_series`, or :func:`compute_feed_time_series`, respectively, unstack it to return a DataFrame of with the columns: - ``"datetime"`` - the columns ``time_series.columns.names`` - ``"value"``: value at the datetime and other columns """ col_names = time_series.columns.names return (time_series.unstack().pipe( pd.DataFrame).reset_index().rename(columns={ 0: "value", "level_2": "datetime" }) # Reorder columns .filter(["datetime"] + col_names + ["value"]).sort_values(["datetime"] + col_names))
def test_unstack_with_missing_int_cast_to_float():
    # https://github.com/pandas-dev/pandas/issues/37115
    df = DataFrame(
        {"a": ["A", "A", "B"], "b": ["ca", "cb", "cb"], "v": [10] * 3}
    ).set_index(["a", "b"])

    # a second int column so the frame holds two blocks
    df["is_"] = 1
    assert len(df._mgr.blocks) == 2

    result = df.unstack("b")
    result[("is_", "ca")] = result[("is_", "ca")].fillna(0)

    expected_columns = MultiIndex.from_tuples(
        [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")],
        names=[None, "b"],
    )
    expected = DataFrame(
        [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
        index=Index(["A", "B"], dtype="object", name="a"),
        columns=expected_columns,
    )
    tm.assert_frame_equal(result, expected)
def load_frame(cls, session, experiment):
    """
    Load part of the table into a well-formatted pandas.DataFrame.

    session can be any object with the execute method.

    Rows are (feature, point, level) tuples for the given experiment; the
    result is pivoted so features are rows and time points are columns.
    Knocked-out features (if any) are blanked out with NaN.
    """
    table = cls.__table__
    stmt = select([table.c.feature, table.c.point, table.c.level]).where(
        table.c.experiment_id == experiment.id)
    query = session.execute(stmt)
    df = DataFrame(iter(query), columns=query.keys())
    df.set_index(["feature", "point"], inplace=True)
    series = df.unstack()
    series.columns = series.columns.droplevel()
    # time points can become unsorted in database, sort them.
    # reindex_axis was removed from pandas; reindex(..., axis=1) is the
    # supported replacement.
    series = series.reindex(
        series.columns[argsort(series.columns.astype(int).values)],
        axis=1, copy=False)
    if experiment.knockouts is not None:
        series.loc[[ko.feature for ko in experiment.knockouts]] = nan
    return series
def cluster_columns(colsim: DataFrame, clus: AgglomerativeClustering,
                    pi=None) -> Dict[int, int]:
    """Cluster columns from different tables together within a cluster of tables

    Column similarities within one table are set to 0 to prevent different
    columns within one table from linking.

    Args:
        colsim: Dataframe of column similarities
        clus: Agglomerative clustering method
        pi: Partition information (for debugging)

    Returns:
        ``{column index: partition column index}``
    """
    # Don't allow different columns within one table to link
    colsim = colsim[(colsim["ti1"] != colsim["ti2"]) |
                    (colsim["ci1"] == colsim["ci2"])]
    colsim = colsim.set_index(["ci1", "ci2"])[0]
    colsim = colsim[~colsim.index.duplicated()]

    # Make symmetric distance matrix
    # (positional axis args were removed from sort_index in pandas 2.0)
    d = 1 - colsim.unstack().sort_index(axis=0).sort_index(axis=1).fillna(0)
    d = pd.DataFrame(np.minimum(d, d.T))

    log.debug(f"Clustering {d.shape} column similarities")
    try:
        partcols = clus.fit_predict(d)
    except Exception:
        # Clustering can fail (e.g. on degenerate matrices); fall back to
        # one partition per column. A bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit, so catch Exception explicitly.
        partcols = range(len(d.index))

    # Sort cluster columns by frequency
    partcol_rank = {  # type: ignore
        pci: r for r, (pci, _) in enumerate(Counter(partcols).most_common())
    }
    partcols = [partcol_rank[pci] for pci in partcols]
    return dict(zip(d.index, partcols))
def test_info_memory_usage_bug_on_multiindex(self):
    # GH 14308
    # memory usage introspection should not materialize .values
    from string import ascii_uppercase as uppercase

    def deep_usage(frame):
        return frame.memory_usage(deep=True).sum()

    n_dates = 100
    n_ids = len(uppercase)
    index = pd.MultiIndex.from_product(
        [list(uppercase), pd.date_range("20160101", periods=n_dates)],
        names=["id", "date"])
    df = DataFrame({"value": np.random.randn(n_dates * n_ids)}, index=index)
    unstacked = df.unstack("id")

    self.assertEqual(df.values.nbytes, unstacked.values.nbytes)
    self.assertTrue(deep_usage(df) > deep_usage(unstacked))

    # high upper bound
    self.assertTrue(deep_usage(unstacked) - deep_usage(df) < 2000)
def to_pico_stream(df: pd.DataFrame) -> pd.DataFrame:
    """Flattens a PicoLog PLW Player data dataframe to a virtual data-stream
    - simulating sequential acquisition of data channel-by-channel.

    For an input dataframe of shape (num_samples, num_channels), the output
    data-stream will have length num_samples x num_channels.

    Args:
        df: PicoLog PLW Player dataframe, where each row has temperature
        measurements across PicoLogger acquisition channels.

    Returns:
        Equivalent flattened data-stream dataframe, where each row has a
        temperature measurement from a single PicoLogger acquisition channel.
        index: None (enumeration of entries)
        columns: `channel`, `temp`
    """
    acquisition_order = df.columns.values

    # Melt to long form: one row per (channel, time) sample
    stream = df.unstack().reset_index()
    stream.columns = ['channel', 'Time', 'temp']

    # An ordered categorical keeps channels in acquisition order when sorting
    channel_dtype = pd.CategoricalDtype(categories=acquisition_order,
                                        ordered=True)
    stream['channel'] = pd.Series(stream['channel'], dtype=channel_dtype)

    # Sort values by time and channel order
    stream.sort_values(['Time', 'channel'], inplace=True)

    # Replace the time index with a plain enumeration of entries
    stream.set_index('Time', inplace=True)
    stream.reset_index(drop=True, inplace=True)
    return stream
def long_to_wide(table: pd.DataFrame, keycolnames: List[str], varcolname: str) -> pd.DataFrame:
    """Reshape `table` from long to wide format.

    `keycolnames` become the identifier columns and the distinct values of
    `varcolname` become the new column headers. Returns the reshaped frame,
    an error string when a (key, variable) pair is repeated, or a dict with
    an error and quick-fixes when `varcolname` had to be coerced to text.

    NOTE(review): mutates `table` in place via set_index - callers appear to
    use the return value only; confirm before relying on the input afterwards.
    """
    varcol = table[varcolname]
    if varcol.dtype != object and not hasattr(varcol, 'cat'):
        # Column names must be text: coerce, warn the user and offer a fix
        error = (
            'Column "%s" was auto-converted to Text because column names must '
            'be text.' % varcolname)
        quick_fixes = [{
            'text': 'Convert "%s" to text' % varcolname,
            'action': 'prependModule',
            'args': ['converttotext', {
                'colnames': varcolname
            }],
        }]
        null_mask = varcol.isnull()
        varcol = varcol.astype(str)
        varcol[null_mask] = np.nan  # keep missing values missing, not "nan"
        table[varcolname] = varcol
    else:
        error = None
        quick_fixes = None

    table.set_index(keycolnames + [varcolname], inplace=True, drop=True)
    if np.any(table.index.duplicated()):
        return 'Cannot reshape: some variables are repeated'

    table = table.unstack()
    # Flatten the (value, variable) MultiIndex columns to just the variable
    table.columns = [col[-1] for col in table.columns.values]
    table.reset_index(inplace=True)

    if error is None:
        return table
    return {
        'dataframe': table,
        'error': error,
        'quick_fixes': quick_fixes,
    }
def test_unstack_tuplename_in_multiindex(self): # GH 19966 idx = pd.MultiIndex.from_product([["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]) df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx) result = df.unstack(("A", "a")) expected = DataFrame( [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]], columns=pd.MultiIndex.from_tuples( [ ("d", "a"), ("d", "b"), ("d", "c"), ("e", "a"), ("e", "b"), ("e", "c"), ], names=[None, ("A", "a")], ), index=pd.Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected)
def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values from string import ascii_uppercase as uppercase def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 M = len(uppercase) index = pd.MultiIndex.from_product([list(uppercase), pd.date_range('20160101', periods=N)], names=['id', 'date']) df = DataFrame({'value': np.random.randn(N * M)}, index=index) unstacked = df.unstack('id') assert df.values.nbytes == unstacked.values.nbytes assert memory_usage(df) > memory_usage(unstacked) # high upper bound assert memory_usage(unstacked) - memory_usage(df) < 2000
def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values from string import ascii_uppercase as uppercase def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 M = len(uppercase) index = pd.MultiIndex.from_product( [list(uppercase), pd.date_range('20160101', periods=N)], names=['id', 'date']) df = DataFrame({'value': np.random.randn(N * M)}, index=index) unstacked = df.unstack('id') self.assertEqual(df.values.nbytes, unstacked.values.nbytes) self.assertTrue(memory_usage(df) > memory_usage(unstacked)) # high upper bound self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000)
def plot_avg_df(df: pd.DataFrame, paper_mode: bool = False, color_group_name=None, horizontal: bool = True):
    """Plot grouped mean benchmark scores as a bar chart with std error bars.

    NOTE(review): assumes `df` has "Scoring", "Attribute", "Mean" and "STD"
    columns - confirm against callers.
    """
    index_cols = ["Scoring", "Attribute"]
    color_cols = ["Attribute"]
    legend_labels = df[color_cols].drop_duplicates().iloc[::-1]
    errors = df[["STD"]]  # NOTE(review): unused, but doubles as an STD-column check
    wide = df.set_index(index_cols).unstack()

    fig_size, font_size, legend_size, grid, title, label_rot, x_al = plot_style(
        paper_mode, title=f"{color_group_name} Comparison", horizontal=horizontal)

    # shared kwargs for both bar orientations
    bar_kwargs = dict(title=title, grid=grid, alpha=0.85, rot=label_rot,
                      figsize=fig_size)
    if horizontal:
        ax = wide["Mean"].plot.barh(xerr=wide["STD"], **bar_kwargs)
    else:
        ax = wide["Mean"].plot(kind="bar", yerr=wide["STD"], **bar_kwargs)

    axis_modification(ax, x_label="Benchmark", y_label=f"Benchmark Score",
                      x_al=x_al, font_size=font_size, horizontal=horizontal)
    legend(ax=ax, paper_mode=paper_mode, legend_labels=legend_labels,
           title=color_group_name, legend_size=legend_size,
           horizontal=horizontal)
    plt.show()
def long_to_wide(
    table: pd.DataFrame, keycolnames: List[str], varcolname: str
) -> pd.DataFrame:
    """Reshape `table` from long to wide format.

    `keycolnames` identify each output row; the values of `varcolname` become
    the new column headers. Returns the reshaped frame, an error string when
    a (key, variable) pair repeats, or a dict carrying the frame plus an
    error and quick-fixes when `varcolname` had to be coerced to text.

    NOTE(review): mutates `table` in place via set_index - callers appear to
    use only the return value.
    """
    varcol = table[varcolname]
    must_coerce = varcol.dtype != object and not hasattr(varcol, "cat")
    if must_coerce:
        # Column names must be text: coerce, warn the user, and offer a fix
        error = (
            'Column "%s" was auto-converted to Text because column names must '
            "be text." % varcolname
        )
        quick_fixes = [
            {
                "text": 'Convert "%s" to text' % varcolname,
                "action": "prependModule",
                "args": ["converttotext", {"colnames": varcolname}],
            }
        ]
        missing = varcol.isnull()
        varcol = varcol.astype(str)
        varcol[missing] = np.nan  # keep missing values missing, not "nan"
        table[varcolname] = varcol
    else:
        error = None
        quick_fixes = None

    table.set_index(keycolnames + [varcolname], inplace=True, drop=True)
    if np.any(table.index.duplicated()):
        return "Cannot reshape: some variables are repeated"

    table = table.unstack()
    # Flatten the (value, variable) MultiIndex columns to just the variable
    table.columns = [col[-1] for col in table.columns.values]
    table.reset_index(inplace=True)

    if error is None:
        return table
    return {"dataframe": table, "error": error, "quick_fixes": quick_fixes}
def transform_dataframe(self, dataframe):
    """
    Use matplotlib to compute boxplot statistics on e.g. timeseries data.
    """
    # Configuration comes from the owning object; grouping is a collection of
    # flags ("series", "year", "month") controlling the reshape below.
    grouping = self.get_grouping(dataframe)
    group_field = self.get_group_field()
    header_fields = self.get_header_fields()
    if "series" in grouping:
        # Unstack so each series is a column
        for i in range(len(header_fields) + 1):
            dataframe = dataframe.unstack()
        groups = {
            col: dataframe[col]
            for col in dataframe.columns
        }
    # NOTE(review): if "series" is NOT in grouping, `groups` is never bound
    # and the loop below raises NameError - confirm callers always pass a
    # "series" grouping.
    if "year" in grouping:
        interval = "year"
    elif "month" in grouping:
        interval = "month"
    else:
        interval = None
    # Compute stats for each column, potentially grouped by year
    all_stats = []
    for header, series in groups.items():
        if interval:
            series_stats = self.boxplots_for_interval(series, interval)
        else:
            interval = None
            series_stats = [self.compute_boxplot(series)]
        series_infos = []
        for series_stat in series_stats:
            series_info = {}
            # A tuple header means the column came from a MultiIndex:
            # first element is the value name, the rest are level values.
            if isinstance(header, tuple):
                value_name = header[0]
                col_values = header[1:]
            else:
                value_name = header
                col_values = []
            col_names = zip(dataframe.columns.names[1:], col_values)
            for col_name, value in col_names:
                series_info[col_name] = value
            # Prefix each stat with its value name, except the interval key
            # which is kept as-is so it can be used as an index level below.
            for stat_name, val in series_stat.items():
                if stat_name == interval:
                    series_info[stat_name] = val
                else:
                    series_info[value_name + '-' + stat_name] = val
            series_infos.append(series_info)
        all_stats += series_infos
    dataframe = DataFrame(all_stats)
    # Rebuild the index/column layout expected by downstream rendering:
    # header fields (and the interval, if any) become index levels that are
    # then unstacked back out into columns.
    if 'series' in grouping:
        index = header_fields + [group_field]
        unstack = len(header_fields)
        if interval:
            index = [interval] + index
            unstack += 1
    else:
        index = [interval]
        unstack = 0
    dataframe.set_index(index, inplace=True)
    dataframe.columns.name = ''
    for i in range(unstack):
        dataframe = dataframe.unstack()
    # Remove blank columns
    dataframe = dataframe.dropna(axis=1, how='all')
    return dataframe
class TestMultiLevel(unittest.TestCase):
    """Tests for MultiIndex-based indexing, stacking/unstacking, grouping and
    reshaping on Series/DataFrame.

    NOTE(review): legacy pandas 0.x / Python 2 test suite - uses APIs that
    have since been removed (``.ix``, ``sortlevel``, ``Panel``, ``cPickle``,
    MultiIndex ``labels=``); it will not run on modern pandas as-is.
    """

    def setUp(self):
        # 10-row frame with a 2-level (first, second) string MultiIndex
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        # ymd: frame with a 3-level (year, month, day) integer MultiIndex
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        a, b = self.frame[:5], self.frame[5:]
        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)
        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        # level-aligned binary ops must match an explicit groupby broadcast
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(
                level='month').transform(np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        # reindexing with an existing MultiIndex must reuse it, not copy it
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        # smoke tests: repr/to_string must not raise on MultiIndex frames
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    def test_get_loc_single_level(self):
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']
        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame[2:]
        assert_frame_equal(res, exp)

        series = Series(np.random.randn(len(index)), index=index)
        res = series.ix[1:2]
        exp = series[2:]
        assert_series_equal(res, exp)

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]
        expected = ymd.reindex(columns=ymd.columns[
            ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        # sorting by level requires a MultiIndex
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        # reset_index should restore native dtypes from the index levels
        tuples = [tuple for tuple in cart_product(['foo', 'bar'],
                                                  [10, 20], [1.0, 1.1])]
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'
        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        # counting an empty selection must still produce zero-filled output
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame({'state': ['naive','naive','naive',
                                  'activ','activ','activ'],
                        'exp':['a','b','b','b','a','a'],
                        'barcode':[1,2,3,4,1,3],
                        'v':['hi','hi','bye','bye','bye','peace'],
                        'extra': np.arange(6.)})

        result = df.groupby(['state','exp','barcode','v']).apply(len)
        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names,
                          expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)
        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_join(self):
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))

        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA' : self.frame,
                       'ItemB' : self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1,2,3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2),
                                                 ("B",3)]))

        y = Series(data=[4,5,6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2),
                                                 ("B",3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    # aggregations exercised against both groupby and level-aware reductions
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                              range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_partial_ix_missing(self):
        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here
        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_to_html(self):
        # smoke test: to_html must handle a named MultiIndex on the columns
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()
# cases-to-vars / vars-to-cases demo: reshape with stack()/unstack(),
# then aggregate with groupby().  Bare expressions are REPL display lines.
d = {'one': [1, 1], 'two': [2, 2]}
i = ['a', 'b']

# Create dataframe
df = DataFrame(data=d, index=i)
df

# vars-to-cases: pivot the columns down into the row index
df.stack()

# cases-to-vars: pivot a row level back up into columns
df.unstack()

# aggregate
d = {'one': [1, 1, 1, 1, 1],
     'two': [2, 2, 2, 2, 2],
     'letter': ['a', 'a', 'b', 'b', 'c']}

# Create dataframe
df = DataFrame(d)
df

one = df.groupby('letter')

# Apply sum function over each letter group
one.sum()

# group by two keys: (letter, one)
letterone = df.groupby(['letter', 'one']).sum()
# (continuation of a tutorial: ``data`` is a DataFrame defined above this
#  chunk, with an index level named 'state' — TODO confirm against caller)
print(data)

# Pivot the columns into the rows with stack() to get a Series.
result = data.stack()
print(result)

# For a hierarchically indexed Series, unstack() rearranges it back into a
# DataFrame; by default the innermost level is unstacked.
print(result.unstack())

# A level number or name may be passed to unstack a different level.
print(result.unstack(0))
print(result.unstack('state'))

# When not all level values are present in every group, unstack introduces
# missing values.
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
print(data2.unstack())

# stack() drops missing data by default, so the round trip is invertible.
print(data2.unstack().stack())
print(data2.unstack().stack(dropna=False))

# When unstacking a DataFrame, the rotated level becomes the innermost one.
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print(df.unstack('state'))
print(df.unstack('state').stack('side'))
# Cleaned-up session history: typo'd attempts (``unstrack``, ``colums``,
# ``drop_duplicated``), the deprecated ``take_last=True`` spelling, the
# mis-lengthed ``ounces`` list, and shell magics were removed, keeping only
# the corrected statements.  Bare expressions are REPL display lines.
# (``result`` is a stacked Series with a level named 'state', and ``ldata``
#  a frame, both defined before this chunk — TODO confirm.)
result.unstack(0)
result.unstack('state')

s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)

df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
df
df.unstack('state')
df.unstack('side')
df.unstack('state').stack('side')

ldata[:10]

# duplicate detection / removal
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data
data.duplicated()
data.drop_duplicates()

data['v1'] = range(7)
data.drop_duplicates(['k1'])
data.drop_duplicates(['k1', 'k2'], keep='last')

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
""" :param obj: :type obj: pandas.core.frame.DataFrame :return: :rtype: pandas.core.frame.DataFrame """ return df3.corrwith(obj) df1.apply(corr_df3) df1.apply(lambda x: df3.corrwith(x)) df3.apply(lambda x: df1.corrwith(x)) df3.index df3 df3.ix["b", ].ix[1:, ] df3.ix["a":"b",].ix[1:, ] df3.unstack().unstack() frame = DataFrame(np.arange(12).reshape((4, 3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]) frame frame.index.names = ['key1', 'key2'] frame.columns.names = ['state', 'color'] frame frame.swaplevel("key1", "key2") frame frame.sortlevel(0) frame frame.sum(level="key1")
# 9.3 Slicing a DataFrame with a hierarchical (multi-level) row index
df7 = DataFrame(np.random.randint(0, 150, size=(8, 2)),
                index=pd.MultiIndex.from_product([list("abcd"),
                                                  ["期中", "期末"]]),
                columns=["Python", "高数"])
df7.loc['a':'c']
df7['a':'c']

# 10  Stacking indexes (stack)
# stack() tip: whichever level you pass disappears from the columns and
# reappears in the rows.  unstack() is the inverse.

# 10.1 stack — push a column-index level down into the row index
df5.stack(level=0)   # level 1 leaves the columns, appears in the rows
df5.stack(level=1)   # level 2 leaves the columns, appears in the rows
df5.stack(level=2)   # level 3 leaves the columns, appears in the rows

# 10.2 un-stack — level n leaves the rows and appears in the columns
df4.unstack(level=0)  # level 1 leaves the rows, appears in the columns
# NOTE(review): the two calls below use ``stack`` although this section is
# about ``unstack`` — looks like a copy-paste slip; confirm intent.
df5.stack(level=1)
df5.stack(level=2)

# 10.3 unstack — push a row-index level up into the column index
df7.unstack()          # innermost row level moves to the columns
df7.unstack(level=0)
df7.unstack(level=1)
df7.unstack(level=-1)

# Aggregation
# Tip: unlike unstack(), aggregation computes *along* the axis you pass.
df3.mean()
df3.mean(axis=0)        # column means
df3.mean(axis=1)        # row means
df3.mean(axis='index')  # column means (same as axis=0)
lr_auc_test, svc_auc_test, rf_auc_test, knn_auc_test, gaussian_auc_test ] models = DataFrame({ 'Training Accuracy': train_acc, 'Testing Accuracy': test_acc, "Cross-Validation Accuracy": cross_val_acc, 'Training AUC': train_auc, 'Testing AUC': test_auc }) models.index = [ 'Logistic Regression', 'Support Vector Machines ', 'Random Forests', 'K-Nearest Neighbors', 'Gaussian Naive Bayes' ] models models1 = DataFrame({'Accuracy': models.unstack()}).reset_index() # plot accuracies plt.figure(figsize=(8, 7)) fig_models = sns.barplot(x='level_0', y='Accuracy', hue='level_1', data=models1) fig_models.set(xlabel='Accuracy Metric', ylabel='Accuracy') fig_models.set_title('The Accuracy of All Models Over Five Metrics') x = zip(X_train.columns, np.transpose(logreg.coef_)) x1 = pd.DataFrame(list(x)) x1.head() # get Correlation Coefficient for each feature using Logistic Regression logreg_df = pd.DataFrame(list(zip(X_train.columns,
# encoding: utf-8
"""Compute each student's per-course scores and their average.

Builds a (name, sex, course) -> score table, pivots the course level into
columns, appends a row-mean column, and fills missing courses with 0.

Fixes over the original: ``DataFrame.sort`` -> ``sort_values``,
``inplace='TRUE'`` (a truthy string) -> ``inplace=True``, Python-2 prints
-> ``print()``, and the discarded ``t1.fillna(0)`` result is now assigned.
"""
from pandas import Series, DataFrame

a = [['刘玄德', '男', '语文', 98.],
     ['刘玄德', '男', '体育', 60.],
     ['关云长', '男', '数学', 60.],
     ['张飞', '女', '语文', 100.],
     ['关云长', '男', '语文', 100.]]
af = DataFrame(a, columns=['name', 'sex', 'course', 'score'])
af = af.sort_values(['name'])
print(af)

af.set_index(['name', 'sex', 'course'], inplace=True)
print(af)

# pivot the 'course' level into columns: one column per course
t1 = af.unstack(level=2)
print(t1)

# per-student average over the courses actually taken
t2 = t1.mean(axis=1, skipna=True)
t1['平均分'] = t2

# courses a student never took count as 0 in the final table
t1 = t1.fillna(0)