def align_partial_results(
    df: pd.DataFrame,
    progr_key: str,  # progression key
    metrics: List[str],
    interpolation: str = "slinear",
    # TODO: Allow normalizing progr_key (e.g. subtract min time stamp)
) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]:
    """Helper function to align partial results with heterogeneous index

    Args:
        df: The DataFrame containing the raw data (in long format).
        progr_key: The key of the column indexing progression (such as
            the number of training examples, timestamps, etc.).
        metrics: The names of the metrics to consider.
        interpolation: The interpolation method used to fill missing values
            (if applicable). See `pandas.DataFrame.interpolate` for
            available options.

    Returns:
        A two-tuple containing a dict mapping the provided metric names to
        the index-normalized and interpolated mean (sem).
    """
    missing_metrics = set(metrics) - set(df["metric_name"])
    if missing_metrics:
        raise ValueError(f"Metrics {missing_metrics} not found in input dataframe")
    # select relevant metrics
    df = df[df["metric_name"].isin(metrics)]
    # drop arm names (assumes 1:1 map between trial indices and arm names)
    df = df.drop("arm_name", axis=1)
    # set multi-index over trial, metric, and progression key
    df = df.set_index(["trial_index", "metric_name", progr_key])
    # sort index
    df = df.sort_index(level=["trial_index", progr_key])
    # drop sem if all NaN (assumes presence of sem column)
    has_sem = not df["sem"].isnull().all()
    if not has_sem:
        df = df.drop("sem", axis=1)
    # create the common index that every map result will be re-indexed w.r.t.
    index_union = df.index.levels[2].unique()
    # loop through (trial, metric) combos and align data
    dfs_mean = defaultdict(list)
    dfs_sem = defaultdict(list)
    for tidx in df.index.levels[0]:  # this could be slow if there are many trials
        for metric in df.index.levels[1]:
            # grab trial+metric sub-df and reindex to common index
            df_ridx = df.loc[(tidx, metric)].reindex(index_union)
            # interpolate / fill missing results (only fills in between points,
            # does not extrapolate)
            # TODO: Allow passing of additional kwargs to `interpolate`
            # TODO: Allow using an arbitrary prediction model for this instead
            try:
                df_interp = df_ridx.interpolate(
                    method=interpolation, limit_area="inside"
                )
            except ValueError as e:
                df_interp = df_ridx
                logger.info(
                    f"Got exception `{e}` during interpolation. "
                    "Using uninterpolated values instead."
                )
            # rename column to trial index, append results
            dfs_mean[metric].append(df_interp["mean"].rename(tidx))
            if has_sem:
                dfs_sem[metric].append(df_interp["sem"].rename(tidx))
    # combine results into output dataframes
    dfs_mean = {metric: pd.concat(dfs, axis=1) for metric, dfs in dfs_mean.items()}
    dfs_sem = {metric: pd.concat(dfs, axis=1) for metric, dfs in dfs_sem.items()}
    return dfs_mean, dfs_sem
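# A minimal usage sketch for `align_partial_results` (the toy metric values
# and trial layout below are assumptions for illustration, not data from any
# real experiment): build a long-format frame with the expected columns, then
# align two trials whose progression steps only partially overlap.
import pandas as pd

raw = pd.DataFrame(
    {
        "trial_index": [0, 0, 1, 1],
        "arm_name": ["0_0", "0_0", "1_0", "1_0"],
        "metric_name": ["loss", "loss", "loss", "loss"],
        "step": [1, 3, 2, 3],
        "mean": [0.9, 0.5, 0.8, 0.6],
        "sem": [0.05, 0.04, 0.05, 0.03],
    }
)
means, sems = align_partial_results(raw, progr_key="step", metrics=["loss"])
# `means["loss"]` has one column per trial, reindexed to the union {1, 2, 3}
# of progression steps, with interior gaps filled by interpolation.
print(means["loss"])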
def test_per_axis_per_level_getitem(self):

    # GH6134
    # example test case
    ix = MultiIndex.from_product(
        [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)]
    )
    df = DataFrame(np.arange(len(ix.to_numpy())), index=ix)

    result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
    expected = df.loc[
        [
            (a, b, c, d)
            for a, b, c, d in df.index.values
            if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")
        ]
    ]
    tm.assert_frame_equal(result, expected)

    expected = df.loc[
        [
            (a, b, c, d)
            for a, b, c, d in df.index.values
            if (a == "A1" or a == "A2" or a == "A3")
            and (c == "C1" or c == "C2" or c == "C3")
        ]
    ]
    result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
    )
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )

    df = DataFrame(
        np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns
    )
    df = df.sort_index(axis=0).sort_index(axis=1)

    # identity
    result = df.loc[(slice(None), slice(None)), :]
    tm.assert_frame_equal(result, df)
    result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)
    result = df.loc[:, (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)

    # index
    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)
    result = df.loc[(slice(None), 1), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # columns
    result = df.loc[:, (slice(None), ["foo"])]
    expected = df.iloc[:, [1, 3]]
    tm.assert_frame_equal(result, expected)

    # both
    result = df.loc[(slice(None), 1), (slice(None), ["foo"])]
    expected = df.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc["A", "a"]
    expected = DataFrame(
        {"bar": [1, 5, 9], "foo": [0, 4, 8]},
        index=Index([1, 2, 3], name="two"),
        columns=Index(["bar", "foo"], name="lvl1"),
    )
    tm.assert_frame_equal(result, expected)

    result = df.loc[(slice(None), [1, 2]), :]
    expected = df.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    # multi-level series
    s = Series(np.arange(len(ix.to_numpy())), index=ix)
    result = s.loc["A1":"A3", :, ["C1", "C3"]]
    expected = s.loc[
        [
            (a, b, c, d)
            for a, b, c, d in s.index.values
            if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")
        ]
    ]
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(result, expected)

    msg = (
        "cannot index with a boolean indexer "
        "that is not the same length as the index"
    )
    with pytest.raises(ValueError, match=msg):
        df.loc[(slice(None), np.array([True, False])), :]

    with pytest.raises(KeyError, match=r"\[1\] not in index"):
        # slice(None) is on the index, [1] is on the columns, but 1 is
        # not in the columns, so we raise
        # This used to treat [1] as positional GH#16396
        df.loc[slice(None), [1]]

    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # not lexsorted
    assert df.index.lexsort_depth == 2
    df = df.sort_index(level=1, axis=0)
    assert df.index.lexsort_depth == 0

    msg = (
        "MultiIndex slicing requires the index to be "
        r"lexsorted: slicing on levels \[1\], lexsort depth 0"
    )
    with pytest.raises(UnsortedIndexError, match=msg):
        df.loc[(slice(None), slice("bar")), :]

    # GH 16734: not sorted, but no real slicing
    result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
    tm.assert_frame_equal(result, df.iloc[[1, 3], :])
def test_per_axis_per_level_setitem(self):

    # test index maker
    idx = pd.IndexSlice

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
    )
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )

    df_orig = DataFrame(
        np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns
    )
    df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

    # identity
    df = df_orig.copy()
    df.loc[(slice(None), slice(None)), :] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc(axis=0)[:, :] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[:, (slice(None), slice(None))] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    # index
    df = df_orig.copy()
    df.loc[(slice(None), [1]), :] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), 1), :] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc(axis=0)[:, 1] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # columns
    df = df_orig.copy()
    df.loc[:, (slice(None), ["foo"])] = 100
    expected = df_orig.copy()
    expected.iloc[:, [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # both
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[idx[:, 1], idx[:, ["foo"]]] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc["A", "a"] = 100
    expected = df_orig.copy()
    expected.iloc[0:3, 0:2] = 100
    tm.assert_frame_equal(df, expected)

    # setting with a list-like
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array(
        [[100, 100], [100, 100]], dtype="int64"
    )
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # not enough values
    df = df_orig.copy()

    msg = "setting an array element with a sequence."
    with pytest.raises(ValueError, match=msg):
        df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array(
            [[100], [100, 100]], dtype="int64"
        )

    msg = "Must have equal len keys and value when setting with an iterable"
    with pytest.raises(ValueError, match=msg):
        df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array(
            [100, 100, 100, 100], dtype="int64"
        )

    # with an alignable rhs
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ["foo"])] = (
        df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5
    )
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[
        (slice(None), 1), (slice(None), ["foo"])
    ]
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(df, expected)

    rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy()
    rhs.loc[:, ("c", "bah")] = 10
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(df, expected)
def test_sort_index_and_reconstruction(self):

    # GH#15622
    # lexsortedness should be identical
    # across MultiIndex construction methods
    df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
    expected = DataFrame(
        [[1, 1], [2, 2], [1, 1], [2, 2]],
        index=MultiIndex.from_tuples(
            [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
        ),
    )
    assert expected.index.is_lexsorted()

    result = DataFrame(
        [[1, 1], [2, 2], [1, 1], [2, 2]],
        index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
    )
    result = result.sort_index()
    assert result.index.is_lexsorted()
    assert result.index.is_monotonic
    tm.assert_frame_equal(result, expected)

    result = DataFrame(
        [[1, 1], [2, 2], [1, 1], [2, 2]],
        index=MultiIndex(
            levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
        ),
    )
    result = result.sort_index()
    assert result.index.is_lexsorted()
    tm.assert_frame_equal(result, expected)

    concatted = pd.concat([df, df], keys=[0.8, 0.5])
    result = concatted.sort_index()
    assert result.index.is_lexsorted()
    assert result.index.is_monotonic
    tm.assert_frame_equal(result, expected)

    # GH#14015
    df = DataFrame(
        [[1, 2], [6, 7]],
        columns=MultiIndex.from_tuples(
            [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
            names=["l1", "Date"],
        ),
    )
    df.columns = df.columns.set_levels(
        pd.to_datetime(df.columns.levels[1]), level=1
    )
    assert not df.columns.is_lexsorted()
    assert not df.columns.is_monotonic
    result = df.sort_index(axis=1)
    assert result.columns.is_lexsorted()
    assert result.columns.is_monotonic
    result = df.sort_index(axis=1, level=1)
    assert result.columns.is_lexsorted()
    assert result.columns.is_monotonic
def __init__(self, data: pd.DataFrame, strategy: Type[Strategy], *,
             cash: float = 10000,
             commission: float = .0,
             margin: float = 1.,
             trade_on_close=False):
    """
    Initialize a backtest. Requires data and a strategy to test.

    `data` is a `pd.DataFrame` with columns: `Open`, `High`, `Low`,
    `Close`, and (optionally) `Volume`. If any columns are missing,
    set them to what you have available, e.g.

        df['Open'] = df['High'] = df['Low'] = df['Close']

    DataFrame index can be either datetime index (timestamps) or a
    monotonic range index (i.e. a sequence of periods).

    `strategy` is a `backtesting.backtesting.Strategy` _subclass_
    (not an instance).

    `cash` is the initial cash to start with.

    `commission` is the commission ratio. E.g. if your broker's commission
    is 1% of trade value, set commission to `0.01`. Note, if you wish to
    account for bid-ask spread, you can approximate doing so by increasing
    the commission, e.g. set it to `0.0002` for commission-less forex
    trading where the average spread is roughly 0.2‰ of asking price.

    `margin` is the required margin (ratio) of a leveraged account.
    No difference is made between initial and maintenance margins.
    To run the backtest using e.g. 50:1 leverage that your broker allows,
    set margin to `0.02` (1 / leverage).

    If `trade_on_close` is `True`, market orders will be executed with
    respect to the current bar's closing price instead of the next bar's
    open.
    """
    if not (isinstance(strategy, type) and issubclass(strategy, Strategy)):
        raise TypeError('`strategy` must be a Strategy sub-type')
    if not isinstance(data, pd.DataFrame):
        raise TypeError("`data` must be a pandas.DataFrame with columns")
    if not isinstance(commission, Number):
        raise TypeError('`commission` must be a float value, percent of '
                        'entry order price')

    data = data.copy(deep=False)

    # Convert index to datetime index
    if (not data.index.is_all_dates and
            not isinstance(data.index, pd.RangeIndex) and
            # Numeric index with most large numbers
            (data.index.is_numeric() and
             (data.index > pd.Timestamp('1975').timestamp()).mean() > .8)):
        try:
            data.index = pd.to_datetime(data.index, infer_datetime_format=True)
        except ValueError:
            pass

    if 'Volume' not in data:
        data['Volume'] = np.nan

    if len(data) == 0:
        raise ValueError('OHLC `data` is empty')
    if len(data.columns & {'Open', 'High', 'Low', 'Close', 'Volume'}) != 5:
        raise ValueError("`data` must be a pandas.DataFrame with columns "
                         "'Open', 'High', 'Low', 'Close', and (optionally) 'Volume'")
    if data[['Open', 'High', 'Low', 'Close']].isnull().values.any():
        raise ValueError('Some OHLC values are missing (NaN). '
                         'Please strip those lines with `df.dropna()` or '
                         'fill them in with `df.interpolate()` or whatever.')
    if not data.index.is_monotonic_increasing:
        warnings.warn('Data index is not sorted in ascending order. Sorting.',
                      stacklevel=2)
        data = data.sort_index()
    if not data.index.is_all_dates:
        warnings.warn('Data index is not datetime. Assuming simple periods.',
                      stacklevel=2)

    self._data = data   # type: pd.DataFrame
    self._broker = partial(
        _Broker, cash=cash, commission=commission, margin=margin,
        trade_on_close=trade_on_close, length=len(data)
    )
    self._strategy = strategy
    self._results = None
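# A minimal usage sketch for the constructor above. `SmaCross` and 'ohlc.csv'
# are placeholders, and `bt.run()` assumes the rest of the backtesting.py
# Backtest API, which is not shown in this excerpt.
import pandas as pd

ohlc = pd.read_csv('ohlc.csv', index_col=0, parse_dates=True)  # hypothetical data file
bt = Backtest(ohlc, SmaCross,        # SmaCross: some Strategy subclass
              cash=10_000,
              commission=.002,       # 0.2% of trade value per trade
              trade_on_close=True)   # fill market orders on the bar's close
stats = bt.run()
print(stats)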
def hampel_filter_with_dev_df(df: pd.DataFrame,
                              vals_col: str,
                              time_col=None,
                              win_size=30,
                              num_dev=3,
                              center_win=True) -> pd.DataFrame:
    """
    This function takes in a dataframe containing a time series of values,
    applies a Hampel filter to these values, and returns a dataframe
    consisting of the original values column along with the Hampel-filtered
    data, outlier values, Boolean flags where outliers were found, values
    for lower deviation from the median, and values for upper deviation
    from the median.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe containing the time series that needs to be Hampel filtered.
    vals_col: str
        Single column name that contains values that need to be filtered.
    time_col: str
        Name of column that contains dates or timestamps.
    win_size: int
        Size of sliding window for filtering. Essentially the number of time
        steps to be considered when filtering.
    num_dev: int
        Number of standard deviations to consider when detecting values that
        would be considered outliers.
    center_win: Boolean
        Determines whether the window is centered about the point being
        filtered. Default=True. If False, the point is at the leading edge
        (i.e. right side) of the window calculation.

    Returns
    -------
    A full dataframe consisting of the original values column along with the
    Hampel-filtered data, outlier values, and Boolean flags where outliers
    were found.
    """
    if time_col is not None:
        if time_col not in list(df.columns):
            raise Exception("Timestamp column '{}' is missing!".format(time_col))
        if not np.issubdtype(df[time_col].dtype, np.datetime64):
            if not np.issubdtype(pd.to_datetime(df[time_col]).dtype,
                                 np.datetime64):
                raise Exception(
                    "Timestamp column '{}' is not np.datetime64".format(time_col))
        df[time_col] = pd.to_datetime(df[time_col])
        drop_cols = set(df.columns) - set([time_col, vals_col])
        # Not really filtered at this point. Just naming appropriately
        # ahead of time.
        orig_vals = df.sort_values(
            time_col, ascending=True).set_index(time_col).copy()
        filtered = orig_vals.drop(columns=drop_cols).copy()
    else:
        if not isinstance(df.index, pd.DatetimeIndex):
            raise Exception("DataFrame index is not pd.DatetimeIndex")
        df.sort_index(inplace=True)
        drop_cols = set(df.columns) - set([vals_col])
        orig_vals = df.copy()
        filtered = orig_vals.drop(columns=drop_cols).copy()

    # Scale factor for estimating standard deviation based upon median value
    L = 1.4826

    # Calculate rolling median for the series
    rolling_median = filtered.rolling(window=int(win_size),
                                      center=center_win,
                                      min_periods=1).median()

    # Define a lambda function to apply to the series to calculate
    # Median Absolute Deviation
    MAD = lambda x: np.median(np.abs(x - np.median(x)))

    # Calculate rolling MAD series
    rolling_MAD = filtered.rolling(window=int(win_size),
                                   center=center_win,
                                   min_periods=1).apply(MAD)

    # Calculate threshold level for filtering based upon the number of
    # standard deviations and the constant scaling factor L.
    threshold = int(num_dev) * L * rolling_MAD

    # Difference between original values and rolling median
    # Again, "filtered" is not yet filtered at this point.
    difference = np.abs(filtered - rolling_median)

    median_minus_threshold = rolling_median - threshold
    median_minus_threshold.rename(columns={vals_col: 'LOWER_DEV'}, inplace=True)

    median_plus_threshold = rolling_median + threshold
    median_plus_threshold.rename(columns={vals_col: 'UPPER_DEV'}, inplace=True)

    # TODO: Look at logic here to possibly not mark as an outlier if
    # threshold value is 0.0

    # Flag outliers
    outlier_idx = difference > threshold

    # Now it's filtered. This replaces original values with values from the
    # rolling_median dataframe where outliers were found.
    filtered[outlier_idx] = rolling_median[outlier_idx]
    filtered.rename(columns={vals_col: 'FLTRD_VAL'}, inplace=True)

    # Capture outliers column
    outliers = orig_vals[outlier_idx].rename(
        columns={vals_col: 'OUTLIER_VAL'}).drop(columns=drop_cols)

    # Capture IS_OUTLIER column
    outlier_idx.rename(columns={vals_col: 'IS_OUTLIER'}, inplace=True)

    # Return a full dataframe consisting of the original values columns along
    # with the Hampel-filtered data, outlier values, and Boolean flags where
    # outliers were found.
    return pd.concat([
        orig_vals, filtered, outliers, outlier_idx,
        median_minus_threshold, median_plus_threshold
    ], axis=1)
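# A minimal usage sketch for `hampel_filter_with_dev_df` (the synthetic series
# below is an assumption for illustration): inject two spikes into a smooth
# signal and check that they are flagged in the IS_OUTLIER column.
import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=200, freq='h')
vals = pd.Series(np.sin(np.linspace(0, 12, 200)), index=idx)
vals.iloc[[50, 140]] += 8  # two obvious spikes
demo = pd.DataFrame({'value': vals})

out = hampel_filter_with_dev_df(demo, vals_col='value', win_size=24, num_dev=3)
print(out[out['IS_OUTLIER']][['OUTLIER_VAL', 'FLTRD_VAL']])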
obj = Series([2, 3, 5, 6], index=['d', 'c', 'b', 'a'])

# Sorting a Series by its index
obj.sort_index()
obj.sort_index(ascending=False)

# Sorting a Series by its values
obj.sort_values()
obj.sort_values(ascending=False)

import numpy as np

df = DataFrame(np.arange(8).reshape(2, 4), index=['two', 'one'],
               columns=['b', 'd', 'a', 'c'])
df.sort_index()
df.sort_index(ascending=False)
df.sort_index(axis=1)
df.sort_index(ascending=False, axis=1)
df.sort_values(by='b', axis=0, ascending=False)
df.sort_values(by='one', axis=1, ascending=False)

obj1 = Series([78, 88, 92, 79, 67, 91, 70, 86, 90, 90])
obj1.sort_values()
obj1.rank()
obj1.rank(ascending=False)  # tied values produce ranks in .5 steps
obj1.rank(ascending=False, method='average')  # ties get the average rank (the .5 values)
obj1.rank(ascending=False, method='min')      # ties get the lowest rank in the group
def test_getitem_duplicates_multiindex(self):
    # GH 5725: the 'A' happens to be a valid Timestamp, so this doesn't
    # raise the appropriate error, only in PY3 of course!

    index = MultiIndex(levels=[['D', 'B', 'C'],
                               [0, 26, 27, 37, 57, 67, 75, 82]],
                       labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                               [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                       names=['tag', 'day'])
    arr = np.random.randn(len(index), 1)
    df = DataFrame(arr, index=index, columns=['val'])
    result = df.val['D']
    expected = Series(arr.ravel()[0:3], name='val',
                      index=Index([26, 37, 57], name='day'))
    tm.assert_series_equal(result, expected)

    def f():
        df.val['A']

    pytest.raises(KeyError, f)

    def f():
        df.val['X']

    pytest.raises(KeyError, f)

    # A is treated as a special Timestamp
    index = MultiIndex(levels=[['A', 'B', 'C'],
                               [0, 26, 27, 37, 57, 67, 75, 82]],
                       labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                               [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                       names=['tag', 'day'])
    df = DataFrame(arr, index=index, columns=['val'])
    result = df.val['A']
    expected = Series(arr.ravel()[0:3], name='val',
                      index=Index([26, 37, 57], name='day'))
    tm.assert_series_equal(result, expected)

    def f():
        df.val['X']

    pytest.raises(KeyError, f)

    # GH 7866
    # multi-index slicing with missing indexers
    idx = MultiIndex.from_product([['A', 'B', 'C'],
                                   ['foo', 'bar', 'baz']],
                                  names=['one', 'two'])
    s = Series(np.arange(9, dtype='int64'), index=idx).sort_index()

    exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
                                      names=['one', 'two'])
    expected = Series(np.arange(3, dtype='int64'),
                      index=exp_idx).sort_index()

    result = s.loc[['A']]
    tm.assert_series_equal(result, expected)
    result = s.loc[['A', 'D']]
    tm.assert_series_equal(result, expected)

    # not any values found
    pytest.raises(KeyError, lambda: s.loc[['D']])

    # empty ok
    result = s.loc[[]]
    expected = s.iloc[[]]
    tm.assert_series_equal(result, expected)

    idx = pd.IndexSlice
    expected = Series([0, 3, 6],
                      index=MultiIndex.from_product(
                          [['A', 'B', 'C'], ['foo']],
                          names=['one', 'two'])).sort_index()

    result = s.loc[idx[:, ['foo']]]
    tm.assert_series_equal(result, expected)
    result = s.loc[idx[:, ['foo', 'bah']]]
    tm.assert_series_equal(result, expected)

    # GH 8737
    # empty indexer
    multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'],
                                           ['alpha', 'beta']))
    df = DataFrame(np.random.randn(5, 6), index=range(5),
                   columns=multi_index)
    df = df.sort_index(level=0, axis=1)

    expected = DataFrame(index=range(5),
                         columns=multi_index.reindex([])[0])
    result1 = df.loc[:, ([], slice(None))]
    result2 = df.loc[:, (['foo'], [])]
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, expected)

    # regression from < 0.14.0
    # GH 7914
    df = DataFrame([[np.mean, np.median], ['mean', 'median']],
                   columns=MultiIndex.from_tuples([('functs', 'mean'),
                                                   ('functs', 'median')]),
                   index=['function', 'name'])
    result = df.loc['function', ('functs', 'mean')]
    assert result == np.mean
def test_per_axis_per_level_getitem(self):

    # GH6134
    # example test case
    ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7),
                                  _mklbl('C', 4), _mklbl('D', 2)])
    df = DataFrame(np.arange(len(ix.get_values())), index=ix)

    result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
    expected = df.loc[[tuple([a, b, c, d])
                       for a, b, c, d in df.index.values
                       if (a == 'A1' or a == 'A2' or a == 'A3') and
                       (c == 'C1' or c == 'C3')]]
    tm.assert_frame_equal(result, expected)

    expected = df.loc[[tuple([a, b, c, d])
                       for a, b, c, d in df.index.values
                       if (a == 'A1' or a == 'A2' or a == 'A3') and
                       (c == 'C1' or c == 'C2' or c == 'C3')]]
    result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples([('A', 1), ('A', 2), ('A', 3), ('B', 1)],
                                   names=['one', 'two'])
    columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                      ('b', 'foo'), ('b', 'bah')],
                                     names=['lvl0', 'lvl1'])

    df = DataFrame(np.arange(16, dtype='int64').reshape(4, 4),
                   index=index, columns=columns)
    df = df.sort_index(axis=0).sort_index(axis=1)

    # identity
    result = df.loc[(slice(None), slice(None)), :]
    tm.assert_frame_equal(result, df)
    result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)
    result = df.loc[:, (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)

    # index
    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)
    result = df.loc[(slice(None), 1), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # columns
    result = df.loc[:, (slice(None), ['foo'])]
    expected = df.iloc[:, [1, 3]]
    tm.assert_frame_equal(result, expected)

    # both
    result = df.loc[(slice(None), 1), (slice(None), ['foo'])]
    expected = df.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc['A', 'a']
    expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]),
                         index=Index([1, 2, 3], name='two'),
                         columns=Index(['bar', 'foo'], name='lvl1'))
    tm.assert_frame_equal(result, expected)

    result = df.loc[(slice(None), [1, 2]), :]
    expected = df.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    # multi-level series
    s = Series(np.arange(len(ix.get_values())), index=ix)
    result = s.loc['A1':'A3', :, ['C1', 'C3']]
    expected = s.loc[[tuple([a, b, c, d])
                      for a, b, c, d in s.index.values
                      if (a == 'A1' or a == 'A2' or a == 'A3') and
                      (c == 'C1' or c == 'C3')]]
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(result, expected)

    def f():
        df.loc[(slice(None), np.array([True, False])), :]

    pytest.raises(ValueError, f)

    # ambiguous cases
    # these can be multiply interpreted (e.g. in this case
    # as df.loc[slice(None), [1]] as well)
    pytest.raises(KeyError, lambda: df.loc[slice(None), [1]])

    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # not lexsorted
    assert df.index.lexsort_depth == 2
    df = df.sort_index(level=1, axis=0)
    assert df.index.lexsort_depth == 0
    with tm.assert_raises_regex(
            UnsortedIndexError,
            'MultiIndex slicing requires the index to be '
            r'lexsorted: slicing on levels \[1\], lexsort depth 0'):
        df.loc[(slice(None), slice('bar')), :]

    # GH 16734: not sorted, but no real slicing
    result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
    tm.assert_frame_equal(result, df.iloc[[1, 3], :])
format = lambda x: '%.2f' % x
print(df['e'].map(format))

s1 = df['e'].map(format)
print(s1.sort_index())

df2 = DataFrame(
    np.arange(8).reshape(2, 4),
    columns=['d', 'a', 'b', 'c'],
    index=['three', 'one'],
)
print(df2)
print(df2.sort_index())        # sort by row index
print(df2.sort_index(axis=1))  # sort by column index

# Data (index and values) is sorted in ascending order by default.
# To sort in descending order, pass ascending=False.
print(df.sort_index(axis=1, ascending=False))

# To sort an object by its values, use the sort_values method.
obj = Series([4, 7, -3, 1])
print(obj)
print(obj.sort_values())

obj2 = Series([4, np.nan, 8, np.nan, -10, 2])
print(obj2)
print(obj2.sort_values())  # NaN values are placed last when sorting.
def test_per_axis_per_level_setitem(self):

    # test index maker
    idx = pd.IndexSlice

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples([('A', 1), ('A', 2), ('A', 3), ('B', 1)],
                                   names=['one', 'two'])
    columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                      ('b', 'foo'), ('b', 'bah')],
                                     names=['lvl0', 'lvl1'])

    df_orig = DataFrame(np.arange(16, dtype='int64').reshape(4, 4),
                        index=index, columns=columns)
    df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

    # identity
    df = df_orig.copy()
    df.loc[(slice(None), slice(None)), :] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc(axis=0)[:, :] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[:, (slice(None), slice(None))] = 100
    expected = df_orig.copy()
    expected.iloc[:, :] = 100
    tm.assert_frame_equal(df, expected)

    # index
    df = df_orig.copy()
    df.loc[(slice(None), [1]), :] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), 1), :] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc(axis=0)[:, 1] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # columns
    df = df_orig.copy()
    df.loc[:, (slice(None), ['foo'])] = 100
    expected = df_orig.copy()
    expected.iloc[:, [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # both
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[idx[:, 1], idx[:, ['foo']]] = 100
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc['A', 'a'] = 100
    expected = df_orig.copy()
    expected.iloc[0:3, 0:2] = 100
    tm.assert_frame_equal(df, expected)

    # setting with a list-like
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
        [[100, 100], [100, 100]], dtype='int64')
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # not enough values
    df = df_orig.copy()

    def f():
        df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
            [[100], [100, 100]], dtype='int64')

    pytest.raises(ValueError, f)

    def f():
        df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
            [100, 100, 100, 100], dtype='int64')

    pytest.raises(ValueError, f)

    # with an alignable rhs
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[
        (slice(None), 1), (slice(None), ['foo'])] * 5
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[
        (slice(None), 1), (slice(None), ['foo'])]
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(df, expected)

    rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy()
    rhs.loc[:, ('c', 'bah')] = 10
    df = df_orig.copy()
    df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(df, expected)
def test_changes_length_raises(self):
    df = DataFrame({"A": [1, 2, 3]})
    with pytest.raises(ValueError, match="change the shape"):
        df.sort_index(key=lambda x: x[:1])
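# A small sketch of the `key` parameter exercised by the test above (the toy
# Series is an assumption for illustration): `key` receives the Index and must
# return a like-shaped result, which is why truncating it raises.
import pandas as pd

s = pd.Series([1, 2, 3], index=['B', 'a', 'C'])
print(s.sort_index(key=lambda idx: idx.str.lower()))  # case-insensitive sort: a, B, C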
class predict(object):
    def __init__(self):
        self.__pwd = self.__getExePath()
        self.__predicted_path = self.__pwd + 'output' + os.path.sep
        self.__predicted_picture_ok_path = (
            self.__pwd + 'output' + os.path.sep + 'picture' + os.path.sep +
            'ok' + os.path.sep)
        self.__predicted_picture_notok_path = (
            self.__pwd + 'output' + os.path.sep + 'picture' + os.path.sep +
            'notok' + os.path.sep)
        self.__data_basic_path = self.__pwd + 'basic_data' + os.path.sep
        self.__my_pred_file_workday = (self.__predicted_path +
                                       'predicted_last_workday.csv')
        self.__my_pred_file_holiday = (self.__predicted_path +
                                       'predicted_last_holiday.csv')
        self.zhibiao_need = DataFrame()

    def __getExePath(self):
        sap = '/'
        if sys.argv[0].find(sap) == -1:
            sap = '\\'
        indx = sys.argv[0].rfind(sap)
        path = sys.argv[0][:indx] + sap
        return path

    def __str2time_1(self, list_in):
        date_list_new = list()
        for time_id in list_in:
            date_new = datetime.datetime.strptime(time_id, '%Y/%m/%d/%H')
            date_new = date_new.strftime('%Y-%m-%d %H:%M:%S.000')
            date_list_new.append(date_new)
        return date_list_new

    def __date_list_1(self, zhibiao):
        d_year = zhibiao['year']
        d_month = zhibiao['month']
        d_day = zhibiao['day']
        d_hour = zhibiao['hour']
        date_len = len(d_year)
        date_haha = range(date_len)
        date_list1 = list(map(
            lambda i: str(d_year[i]) + '/' + str(d_month[i]) + '/' +
                      str(d_day[i]) + '/' + str(d_hour[i]),
            date_haha))
        date_list_out = self.__str2time_1(date_list1)
        return date_list_out

    def __str2time(self, list_in):
        date_list_new = list()
        for time_id in list_in:
            date_new = datetime.datetime.strptime(time_id, '%Y/%m/%d')
            date_new = date_new.strftime('%Y-%m-%d %H:%M:%S.000')
            date_list_new.append(date_new)
        return date_list_new

    def __date_list(self, zhibiao):
        d_year = zhibiao['year']
        d_month = zhibiao['month']
        d_day = zhibiao['day']
        date_len = len(d_year)
        date_haha = range(date_len)
        date_list1 = list(map(
            lambda i: str(d_year[i]) + '/' + str(d_month[i]) + '/' +
                      str(d_day[i]),
            date_haha))
        date_list_out = self.__str2time(date_list1)
        return date_list_out

    def __isworkday(self, data, workdays):
        # NOTE: the original checked `date_thisday in date_thisday`, which is
        # always True; testing membership in the workday calendar is the
        # presumed intent, so the calendar is passed in here.
        date_thisday = data[0]
        return date_thisday in workdays

    def __ratio_mark(self, data_input, standard):
        data_input_list = DataFrame(data_input, columns=['flow'])
        mark_1 = abs(data_input_list.where(
            data_input_list['flow'] < standard) / standard)
        mark_out = mark_1.fillna(1)
        return mark_out

    def __del_file(self, path):
        ls = os.listdir(path)
        for i in ls:
            c_path = os.path.join(path, i)
            if os.path.isdir(c_path):
                self.__del_file(c_path)
            else:
                os.remove(c_path)

    def __not_ok(self, data_input, lastday_input, data_input_ratio, net_item,
                 flow_standard, alert_standard):
        ratio_need = data_input_ratio[['UE', 'erab', 'handover', 'rrc']]
        ratio_need_columns = ratio_need.columns
        ratio_out = DataFrame()
        for columns_need in ratio_need_columns:
            ratio_item = ratio_need[columns_need]
            ratio_isok = list(map(lambda y: 0 if y > alert_standard else 1,
                                  ratio_item))
            ratio_out[columns_need] = ratio_isok
        flow_ratio = DataFrame(data_input_ratio['flow'], columns=['flow'])
        flow_ratio.index = range(len(flow_ratio))
        flow_data_pred = data_input['flow']
        flow_data_pred.index = range(len(flow_data_pred))
        last_flow = lastday_input['flow']
        # If flow exceeds the standard (3 GB), skip the ratio conversion
        flow_mark = self.__ratio_mark(flow_data_pred, flow_standard)
        flow_ratio_out = DataFrame(np.array(flow_ratio) * np.array(flow_mark),
                                   columns=['flow'])
        flow_ratio_out_series = flow_ratio_out['flow']
        flow_isok = list(map(lambda y: 0 if y > alert_standard else 1,
                             flow_ratio_out_series))
        ratio_out['flow'] = flow_isok
        notok_hours = 24 - ratio_out.sum(axis=0)
        # Plot predicted vs. last-day flow; the file goes to the "notok"
        # folder when more than 3 hours are flagged, otherwise to "ok".
        if notok_hours['flow'] > 3:
            save_path = self.__predicted_picture_notok_path
        else:
            save_path = self.__predicted_picture_ok_path
        fig = plt.figure(figsize=(6, 3))
        ax = fig.add_subplot(111)
        ax.plot(range(24), flow_data_pred, label="flow_pred", color="g")
        ax.plot(range(24), last_flow, label="flow_lastday", color="r")
        ax.set_ylabel('GB')
        ax.set_xlabel('Hour')
        plt.legend(loc="upper left")
        plt.title('net_num: ' + net_item + ' flow lastday')
        plt.savefig(save_path + net_item + '.png')
        return ratio_out

    @jit
    def tensor_flow_go(self, net_list):
        predicted_last = pd.DataFrame(columns=['UE', 'erab', 'flow',
                                               'handover', 'rrc', 'net_num'])
        for cell in net_list:
            y_zhunbei = self.zhibiao_need[self.zhibiao_need.net_num == cell][
                ['time', 'UE', 'erab', 'flow', 'handover', 'rrc']]
            y = numpy.array(y_zhunbei[['UE', 'erab', 'flow', 'handover', 'rrc']])
            x = np.array(range(len(y)))
            data = {
                tf.contrib.timeseries.TrainEvalFeatures.TIMES: x,
                tf.contrib.timeseries.TrainEvalFeatures.VALUES: y,
            }
            reader = NumpyReader(data)
            train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
                reader, batch_size=10, window_size=48)
            ar = tf.contrib.timeseries.ARRegressor(
                periodicities=48, input_window_size=24, output_window_size=24,
                num_features=5,
                loss=tf.contrib.timeseries.ARModel.NORMAL_LIKELIHOOD_LOSS)
            ar.train(input_fn=train_input_fn, steps=700)
            evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
            # keys of evaluation: ['covariance', 'loss', 'mean', 'observed',
            # 'start_tuple', 'times', 'global_step']
            evaluation = ar.evaluate(input_fn=evaluation_input_fn, steps=1)
            (predictions,) = tuple(ar.predict(
                input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
                    evaluation, steps=24)))
            prediction = predictions['mean']
            predicted_out = DataFrame(prediction)
            predicted_out.columns = ['UE', 'erab', 'flow', 'handover', 'rrc']
            predicted_out['net_num'] = cell
            predicted_last = predicted_last.append(predicted_out)
        # NOTE: the original silently dropped this result; returning it is
        # the presumed intent.
        return predicted_last

    def predicter_tenorflow(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        conn = pymysql.connect("localhost", "root", "1234", "python_test",
                               charset='utf8')
        cursor = conn.cursor()
        cursor.execute("select * from python_test.2018workday")
        calendar_2018 = cursor.fetchall()
        workday_calender = DataFrame(list(calendar_2018), columns=['date'])
        cursor.close()
        conn.close()
        workday_2018 = self.__str2time(list(workday_calender['date']))
        # zhibiao_oneday = pd.read_csv(self.__pwd + 'today.csv')
        zhibiao_oneday = pd.read_csv('F:/work/pyqt/predicter/today.csv')
        data_1 = self.__date_list(zhibiao_oneday)
        is_workday = self.__isworkday(data_1, workday_2018)
        print(is_workday)
        conn = pymysql.connect("localhost", "root", "1234", "python_test",
                               charset='utf8')
        cursor = conn.cursor()
        engine = create_engine(
            'mysql+pymysql://root:[email protected]:3306/python_test?charset=utf8')
        if is_workday:
            # Read the workday table from the database
            cursor.execute("select * from python_test.workday")
            work_day = cursor.fetchall()
            zhibiao = DataFrame(list(work_day),
                                columns=['net_num', 'year', 'month', 'day',
                                         'hour', 'UE', 'erab', 'flow',
                                         'handover', 'rrc'])
            cursor.execute("drop table python_test.workday")
            zhibiao1 = zhibiao.append(zhibiao_oneday)
            zhibiao2 = zhibiao1.drop_duplicates(['net_num', 'year', 'month',
                                                 'day', 'hour'])
            zhibiao2.to_sql('workday', con=engine, schema='python_test',
                            index=False, index_label=False,
                            if_exists='append', chunksize=1000)
            conn.commit()
        else:
            # Read the holiday table from the database
            cursor.execute("select * from python_test.holiday")
            holi_day = cursor.fetchall()
            zhibiao = DataFrame(list(holi_day),
                                columns=['net_num', 'year', 'month', 'day',
                                         'hour', 'UE', 'erab', 'flow',
                                         'handover', 'rrc'])
            cursor.execute("drop table python_test.holiday")
            zhibiao1 = zhibiao.append(zhibiao_oneday)
            zhibiao2 = zhibiao1.drop_duplicates(['net_num', 'year', 'month',
                                                 'day', 'hour'])
            zhibiao2.to_sql('holiday', con=engine, schema='python_test',
                            index=False, index_label=False,
                            if_exists='append', chunksize=1000)
            conn.commit()
        cursor.close()
        conn.close()
        date_index = self.__date_list_1(zhibiao)
        zhibiao.index = date_index
        zhibiao['time'] = date_index
        # zhibiao1.to_csv("F:/work/tianhe4location/test/wori.csv")
        date_index_list = sorted(list(set(list(date_index))), reverse=True)
        date_len = len(date_index_list) / 24
        if date_len <= 30:
            zhibiao_need_index = date_index_list
        else:
            zhibiao_need_index = date_index_list[:60 * 24]
        # .ix is removed from pandas; .loc selects the same string labels here
        self.zhibiao_need = zhibiao.loc[zhibiao_need_index]
        self.zhibiao_need = self.zhibiao_need.sort_index()
        net_name = set(list(self.zhibiao_need.net_num))
        net_name_all = list(net_name)
        # print(net_name_all)
        net_num_perg = round(len(net_name) / 8)
        net_list_index = list()
        for i in range(8):
            if i == 7:
                net_list_i = net_name_all[i * net_num_perg:len(net_name) - 1]
            else:
                net_list_i = net_name_all[i * net_num_perg:(i + 1) * net_num_perg]
            net_list_index.append(net_list_i)
        predicted_all_last = pd.DataFrame(columns=['UE', 'erab', 'flow',
                                                   'handover', 'rrc', 'net_num'])
        # print(net_list_index)
        start = time.time()
        for net_list_index_processing in net_list_index:
            self.tensor_flow_go(net_list_index_processing)
        end = time.time()
        print(end - start)
        # xx = net_list_index[0]
        # self.tensor_flow_go(xx)
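# A minimal driver sketch for the `predict` class above (an assumption: it
# presumes the MySQL tables and the today.csv input that
# `predicter_tenorflow` reads are present in your environment).
if __name__ == '__main__':
    predictor = predict()
    predictor.predicter_tenorflow()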
print(fund_trade)

# Take three columns from fund_trade (trade date, fund code, close price),
# indexed by the fund code.
F_CLOSE = DataFrame(fund_trade[['F_TRADE_DATE', 'F_FUND_CODE', 'F_CLOSE']])
FC = F_CLOSE

# Boolean indexing: select all close prices of a fund by testing the
# fund-code column against the given code(s). Membership against several
# codes needs .isin(); the original `== codelist` comparison and the bare
# generator expressions were broken.
codelist = (165516, 161714)
FC[FC['F_FUND_CODE'].isin(codelist)]
F_CLOSE_CHOOSEN = DataFrame(FC[FC['F_FUND_CODE'].isin(codelist)])
FCC = F_CLOSE_CHOOSEN
FC

# ## 20-23: trying to get the historical close prices of two funds????
print(FC)
for code in codelist:
    print(code)

F_CLOSE_CHOOSEN = {}
for code in codelist:
    F_CLOSE_CHOOSEN = DataFrame(F_CLOSE[F_CLOSE['F_FUND_CODE'] == code])

A = F_CLOSE_CHOOSEN.sort_values(by='F_TRADE_DATE')
A
F_CLOSE_CHOOSEN

# Sort by the trade-date column (sort_values replaces the legacy
# sort_index(by=...) spelling).
F_CLOSE_CHOOSEN.sort_values(by='F_TRADE_DATE')

F_CLOSE[F_CLOSE['F_FUND_CODE'] == 165516]
F_CLOSE.loc[510220]  # .ix is removed; .loc selects by label

CODE_LIST = (510220, 165516)
for code in CODE_LIST:
    print(code)
F_CLOSE_CHOOSE = DataFrame(F_CLOSE[F_CLOSE['F_FUND_CODE'].isin(CODE_LIST)])
#Create new row
empDf.append(Series([5, False, 'Derek', 2],
                    index=['id', 'isManager', 'name', 'deptId']),
             ignore_index=True)  # note: .append() was removed in pandas 2.0; pd.concat replaces it

#Delete a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

#Delete a row
empDf.drop(1)

#Sort a Data Frame
empDf.sort_index(axis=1)
empDf.sort_values(['isManager', 'name'])  # .sort() was removed; use .sort_values()

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a DataFrame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        #if pd.isnull(col):
        print(pd.isnull(col), rowNum, colName)

#----------------------------------------------------------------------------
# Data Operations
#----------------------------------------------------------------------------
def test_timegrouper_with_reg_groups(self):

    # GH 3794
    # allow combination of timegrouper/reg groups

    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 1, 1, 13, 0),
            datetime(2013, 1, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 12, 2, 14, 0),
        ]
    }).set_index('Date')

    df_sorted = df_original.sort_values(by='Quantity', ascending=False)

    for df in [df_original, df_sorted]:
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])

        result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        expected = DataFrame({
            'Buyer': 'Carl Mark Carl Joe'.split(),
            'Quantity': [1, 3, 9, 18],
            'Date': [
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 10, 1, 13, 0),
            datetime(2013, 10, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 2, 12, 0),
            datetime(2013, 10, 2, 14, 0),
        ]
    }).set_index('Date')

    df_sorted = df_original.sort_values(by='Quantity', ascending=False)
    for df in [df_original, df_sorted]:
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark Carl Joe'.split(),
            'Quantity': [6, 8, 3, 4, 10],
            'Date': [
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 2, 0, 0),
                datetime(2013, 10, 2, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])

        result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # passing the name
        df = df.reset_index()
        result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                             'Buyer']).sum()
        assert_frame_equal(result, expected)

        with pytest.raises(KeyError):
            df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

        # passing the level
        df = df.set_index('Date')
        result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                             'Buyer']).sum()
        assert_frame_equal(result, expected)
        result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        with pytest.raises(ValueError):
            df.groupby([pd.Grouper(freq='1M', level='foo'), 'Buyer']).sum()

        # multi names
        df = df.copy()
        df['Date'] = df.index + pd.offsets.MonthEnd(2)
        result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                             'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # error as we have both a level and a name!
        with pytest.raises(ValueError):
            df.groupby([pd.Grouper(freq='1M', key='Date', level='Date'),
                        'Buyer']).sum()

        # single groupers
        expected = DataFrame({'Quantity': [31],
                              'Date': [datetime(2013, 10, 31, 0, 0)]
                              }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M')]).sum()
        assert_frame_equal(result, expected)

        expected = DataFrame({'Quantity': [31],
                              'Date': [datetime(2013, 11, 30, 0, 0)]
                              }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
        assert_frame_equal(result, expected)

    # GH 6764 multiple grouping with/without sort
    df = DataFrame({
        'date': pd.to_datetime([
            '20121002', '20121007', '20130130', '20130202', '20130305',
            '20121002', '20121207', '20130130', '20130202', '20130305',
            '20130202', '20130305'
        ]),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    for freq in ['D', 'M', 'A', 'Q-APR']:
        expected = df.groupby('user_id')['whole_cost'].resample(
            freq).sum().dropna().reorder_levels(
            ['date', 'user_id']).sort_index().astype('int64')
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby(
            [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby(
            [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
        assert_series_equal(result2, expected)
def test_sort_values_key_empty(self, sort_by_key):
    df = DataFrame(np.array([]))

    df.sort_values(0, key=sort_by_key)
    df.sort_index(key=sort_by_key)
#trade_type.head(20).iloc[:, :5]
#trade_type = pd.read_pickle(data_path + '/trade_type_16_100w')
tmp = trade_type.copy()
tmp.index.names = ['trddt', 'trader', 'action']
#tmp = tmp.groupby(by='trddt').apply(lambda x: x / x.sum()); tmp
#tmp.index.names = ['stkcd']

select_stock = DataFrame(select_stock)
select_stock.columns.get_level_values(0)  #iloc[:,0].name[0]
Group_By = DataFrame(np.nan, index=tmp.index,
                     columns=select_stock.columns.get_level_values(0))

for c in Group_By.columns:
    Group_By[c] = tmp[select_stock[(c, '2015-02-25')]].mean(axis=1)

G1 = Group_By.sort_index().loc(axis=0)[:'20150217', :, :]
G1
G2 = Group_By.sort_index().loc(axis=0)['20150225':, :, :]
G2

G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']

G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'] - G1.groupby(
    by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']

(G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'] -
 G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
 ) / G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']

G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'].reset_index(
    level=1, drop=True) - G1.groupby(by=['trader', 'action']).mean().loc(
    axis=0)[:, 'sell'].reset_index(level=1, drop=True)

G1.groupby(by=['trader']).head(
def test_increment_without_metadata_with_schema(
    self,
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
    schema_file: Optional[LocalPath],
    verbose: bool,
):
    # List of (expected frame filename, data filename) tuples
    targets: List[Tuple[str, str]] = [
        ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6-combined.csv"),
        ("iris-part-1-2.csv", "iris-part-2-of-6-combined.csv"),
        ("iris-part-1-2-3.csv", "iris-part-3-of-6-combined.csv"),
        ("iris-part-1-2-3-4.csv", "iris-part-4-of-6-combined.csv"),
        ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6-combined.csv"),
        ("iris_plus.csv", "iris-part-6-of-6-combined.csv"),
    ]

    expected_hashfile = (
        LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE)
        if archive_fixture.hash_file is None
        else archive_fixture.hash_file
    )
    assert not os.path.exists(expected_hashfile)
    assert not os.path.exists(archive_fixture.cache_file)
    assert len(archive_dir.listdir()) == 0

    expected_schemafile = (
        archive_dir.join(syphon.schema.DEFAULT_FILE)
        if schema_file is None
        else schema_file
    )
    assert not os.path.exists(expected_schemafile)
    syphon.init(
        SortedDict({"0": "PetalColor", "1": "Species"}), expected_schemafile
    )
    assert os.path.exists(expected_schemafile)

    for expected_frame_filename, data_filename in targets:
        assert archive_fixture(
            archive_dir,
            [os.path.join(get_data_path(), data_filename)],
            schema_filepath=schema_file,
            cache_filepath=archive_fixture.cache_file,
            hash_filepath=archive_fixture.hash_file,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        expected_frame = DataFrame(
            read_csv(
                os.path.join(get_data_path(), expected_frame_filename),
                dtype=str,
                index_col="Index",
            )
        )
        expected_frame.sort_index(inplace=True)
        actual_frame = DataFrame(
            read_csv(str(archive_fixture.cache_file), dtype=str, index_col="Index")
        )
        actual_frame.sort_index(inplace=True)
        assert_captured_outerr(capsys.readouterr(), False, False)
        assert_frame_equal(expected_frame, actual_frame)

    assert os.path.exists(expected_hashfile)
    assert syphon.check(
        archive_fixture.cache_file,
        hash_filepath=expected_hashfile,
        verbose=verbose,
    )
            ret_versus_prob, axis=0)
        sim += 1

    elapsed_time = time.time() - start_time
    print('Simulation time:', elapsed_time)

    return (performance_modelo, ret_medio_ibov_sim, ret_medio_port_sim,
            ret_medio_port_long_sim, ret_medio_port_short_sim,
            corr_prob_ret_sim, datas_teste_sim)


# load the IBOVESPA index and historical component data
compomentes = ler_base_componetes()
base_total = carrega_dados()

datas = DataFrame(base_total['data'].drop_duplicates().values, columns=['data'])
datas = datas.set_index(['data'])
datas = datas.sort_index(axis=0)

limit_inf = '19990202 18:00:000'
limit_sup = '20171230 18:00:000'
datas = datas.loc[limit_inf:limit_sup]
datas = datas.sort_index(axis=0)
datas = datas.reset_index(['data'])

# list of variables for the model (features) and for applying the logarithm (cols)
#features = ['data', 'codigo', 'retorno', 'acao_close', 'roe', 'pl', 'irf',
#            'sharpe', 'petroleo_close', 'dolar_close', 'dji_close',
#            'sp500_close', 'risco_brasil', 'ibov_fut_close']
#cols = ['acao_close', 'petroleo_close', 'dolar_close', 'dji_close',
#        'sp500_close', 'risco_brasil', 'ibov_fut_close']
features = [
    'data', 'codigo', 'retorno', 'acao_close', 'roe', 'sharpe',
    'dolar_close', 'sp500_close', 'ibov_fut_close'
]
cols = ['acao_close', 'dolar_close', 'sp500_close', 'ibov_fut_close']
sort_Ser = unsort_Ser.sort_index()
print(sort_Ser)
print(sort_Ser.sort_index(ascending=False))

# Of course, you can also sort by values
print(unsort_Ser.sort_values())
print(unsort_Ser.sort_values(ascending=False))

# For a DataFrame you can sort the whole frame by one or more columns.
# Recall the dict-based DataFrame constructor:
temp_df = DataFrame({
    'Stu_name': ['Justin', 'Hux', 'Jacob', 'Steve'],
    'Math_result': [99, 88, 100, 20]
})
# sort_index(by=...) is the legacy spelling (removed in modern pandas)
print(temp_df.sort_index(by='Math_result', ascending=False))
# Let's check whether sort_values() does the same
print(temp_df.sort_values('Math_result', ascending=False))
# We find the two are functionally identical.
# Can we do a compound sort on several columns at once?
print(temp_df.sort_index(by=['Math_result', 'Stu_name'], ascending=False))
# sort_values handles the same compound sort with a list of column names:
# temp_df.sort_values(['Math_result', 'Stu_name'], ascending=False)

# After sorting, let's look at ranking: the rank of each element in a Series
temp_Ser = Series([23, 12, -90, 24, 32, 7, 13])
print(temp_Ser.rank())

# Put the values and their ranks side by side
rank = temp_Ser.rank()
temp_df = DataFrame({'Value': temp_Ser, 'Rank': rank})
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)

#Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)

obj = Series([4, 7, -3, 2])
obj.sort_values()  # Series.order() was the legacy spelling

obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')  # legacy: frame.sort_index(by='b')

#ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
def test_sort_index_nan_multiindex(self):
    # GH#14784
    # incorrect sorting w.r.t. nans
    tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
    mi = MultiIndex.from_tuples(tuples)

    df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
    s = Series(np.arange(4), index=mi)

    df2 = DataFrame(
        {
            "date": pd.DatetimeIndex(
                [
                    "20121002", "20121007", "20130130", "20130202",
                    "20130305", "20121002", "20121207", "20130130",
                    "20130202", "20130305", "20130202", "20130305",
                ]
            ),
            "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            "whole_cost": [
                1790, np.nan, 280, 259, np.nan, 623, 90, 312,
                np.nan, 301, 359, 801,
            ],
            "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
        }
    ).set_index(["date", "user_id"])

    # sorting frame, default nan position is last
    result = df.sort_index()
    expected = df.iloc[[3, 0, 2, 1], :]
    tm.assert_frame_equal(result, expected)

    # sorting frame, nan position last
    result = df.sort_index(na_position="last")
    expected = df.iloc[[3, 0, 2, 1], :]
    tm.assert_frame_equal(result, expected)

    # sorting frame, nan position first
    result = df.sort_index(na_position="first")
    expected = df.iloc[[1, 2, 3, 0], :]
    tm.assert_frame_equal(result, expected)

    # sorting frame with removed rows
    result = df2.dropna().sort_index()
    expected = df2.sort_index().dropna()
    tm.assert_frame_equal(result, expected)

    # sorting series, default nan position is last
    result = s.sort_index()
    expected = s.iloc[[3, 0, 2, 1]]
    tm.assert_series_equal(result, expected)

    # sorting series, nan position last
    result = s.sort_index(na_position="last")
    expected = s.iloc[[3, 0, 2, 1]]
    tm.assert_series_equal(result, expected)

    # sorting series, nan position first
    result = s.sort_index(na_position="first")
    expected = s.iloc[[1, 2, 3, 0]]
    tm.assert_series_equal(result, expected)
def _write_glue_preds( task_name: str, preds_df: pd.DataFrame, pred_dir: str, split_name: str, strict_glue_format: bool = False, ): """ Write predictions to separate files located in pred_dir. We write special code to handle various GLUE tasks. Use strict_glue_format to guarantee compatibility with GLUE website. Args: task_name: task name preds_df: predictions DataFrame for a single task, as returned by evaluate(). pred_dir: directory to write predictions split_name: name of this split ('train', 'val', or 'test') strict_glue_format: if true, writes format compatible with GLUE website. """ def _apply_pred_map(preds_df, pred_map, key="prediction"): """ Apply preds_map, in-place. """ preds_df[key] = [pred_map[p] for p in preds_df[key]] def _write_preds_with_pd(preds_df: pd.DataFrame, pred_file: str, write_type=int): """ Write TSV file in GLUE format, using Pandas. """ required_cols = ["index", "prediction"] if strict_glue_format: cols_to_write = required_cols quoting = QUOTE_NONE log.info( "Task '%s', split '%s': writing %s in " "strict GLUE format.", task_name, split_name, pred_file, ) else: all_cols = set(preds_df.columns) # make sure we write index and prediction as first columns, # then all the other ones we can find. cols_to_write = required_cols + sorted( list(all_cols.difference(required_cols))) quoting = QUOTE_MINIMAL preds_df.to_csv( pred_file, sep="\t", index=False, float_format="%.3f", quoting=quoting, columns=cols_to_write, ) if len(preds_df) == 0: # catch empty lists log.warning("Task '%s': predictions are empty!", task_name) return def _add_default_column(df, name: str, val): """ Ensure column exists and missing values = val. """ if name not in df: df[name] = val df[name].fillna(value=val, inplace=True) preds_df = preds_df.copy() _add_default_column(preds_df, "idx", -1) _add_default_column(preds_df, "sent1_str", "") _add_default_column(preds_df, "sent2_str", "") _add_default_column(preds_df, "labels", -1) # Rename columns to match output headers. preds_df.rename( { "idx": "index", "preds": "prediction", "sent1_str": "sentence_1", "sent2_str": "sentence_2", "labels": "true_label", }, axis="columns", inplace=True, ) if task_name == "mnli" and split_name == "test": # 9796 + 9847 = 19643 assert len(preds_df) == 19643, "Missing predictions for MNLI!" log.info("There are %d examples in MNLI, 19643 were expected", len(preds_df)) # Sort back to original order to split matched and mismatched, which are # treated as a single dataset by jiant. 
preds_df.sort_index(inplace=True) pred_map = {0: "neutral", 1: "entailment", 2: "contradiction"} _apply_pred_map(preds_df, pred_map, "prediction") _write_preds_with_pd( preds_df.iloc[:9796], _get_pred_filename("mnli-m", pred_dir, split_name, strict_glue_format), ) _write_preds_with_pd( preds_df.iloc[9796:], _get_pred_filename("mnli-mm", pred_dir, split_name, strict_glue_format), ) elif task_name in ["rte", "qnli"]: pred_map = {0: "not_entailment", 1: "entailment"} _apply_pred_map(preds_df, pred_map, "prediction") _write_preds_with_pd( preds_df, _get_pred_filename(task_name, pred_dir, split_name, strict_glue_format)) elif task_name in ["sts-b"]: preds_df["prediction"] = [ min(max(0.0, pred * 5.0), 5.0) for pred in preds_df["prediction"] ] _write_preds_with_pd( preds_df, _get_pred_filename(task_name, pred_dir, split_name, strict_glue_format), write_type=float, ) elif task_name in ["wmt"]: # convert each prediction to a single string if we find a list of # tokens if isinstance(preds_df["prediction"][0], list): assert isinstance(preds_df["prediction"][0][0], str) preds_df["prediction"] = [ " ".join(pred) for pred in preds_df["prediction"] ] _write_preds_with_pd( preds_df, _get_pred_filename(task_name, pred_dir, split_name, strict_glue_format), write_type=str, ) else: _write_preds_with_pd( preds_df, _get_pred_filename(task_name, pred_dir, split_name, strict_glue_format), write_type=int, ) log.info("Wrote predictions for task: %s", task_name)
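# Hedged usage sketch (not from the original codebase): writing RTE test
# predictions in strict GLUE format. Assumes `log`, `_get_pred_filename`,
# and the csv QUOTE_* constants are available in this module, and that the
# output directory already exists.
import pandas as pd

rte_preds = pd.DataFrame({"idx": [0, 1, 2], "preds": [1, 0, 1]})
_write_glue_preds(
    task_name="rte",
    preds_df=rte_preds,
    pred_dir="glue_preds",    # hypothetical output directory
    split_name="test",
    strict_glue_format=True,  # produce GLUE-website-compatible TSV
)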
import numpy as np
from pandas import DataFrame, Series

frame.apply(f, axis=1)  # apply f (defined earlier) across each row

# Applying a function element-wise to the whole DataFrame --------------------------------------------
fmt = lambda x: '%.2f' % x  # renamed from `format` so the built-in is not shadowed
frame.applymap(fmt)
frame.applymap(lambda x: x + 10)

# Sorting --------------------------------------------
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
obj.sort_index(ascending=False)

frame = DataFrame(np.arange(12).reshape((3, 4)),
                  index=['three', 'one', 'two'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)
frame.sort_values(by='b', ascending=False)  # sort_index(by=...) is deprecated; use sort_values

# Summary and descriptive statistics --------------------------------------------
# By default, NaN values are omitted from the computations.
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df
df.sum()
df.sum(axis=1)
df.mean(axis=1)
df.mean(axis=1, skipna=False)  # rows containing any NaN yield NaN
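# Worked check of the skipna behavior just described, using the df defined
# above (expected values computed by hand):
print(df.mean(axis=1))
# a    1.400    <- NaN in 'two' is skipped
# b    1.300
# c      NaN    <- no valid values in the row at all
# d   -0.275
print(df.mean(axis=1, skipna=False))
# a      NaN    <- any NaN poisons the row once skipna=False
# b    1.300
# c      NaN
# d      NaN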
def test_sort_index_duplicates(self):
    # GH#9816: sort_index(by=...) is deprecated; all of these now route
    # to .sort_values
    df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"])

    with pytest.raises(ValueError, match="not unique"):
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by="a")
    with pytest.raises(ValueError, match="not unique"):
        df.sort_values(by="a")

    with pytest.raises(ValueError, match="not unique"):
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=["a"])
    with pytest.raises(ValueError, match="not unique"):
        df.sort_values(by=["a"])

    with pytest.raises(ValueError, match="not unique"):
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            # multi-column 'by' is separate codepath
            df.sort_index(by=["a", "b"])
    with pytest.raises(ValueError, match="not unique"):
        # multi-column 'by' is separate codepath
        df.sort_values(by=["a", "b"])

    # with multi-index
    # GH4370
    df = DataFrame(np.random.randn(4, 2),
                   columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]))
    with pytest.raises(ValueError, match="level"):
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by="a")
    with pytest.raises(ValueError, match="level"):
        df.sort_values(by="a")

    # convert tuples to a list of tuples
    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=[("a", 1)])
    expected = df.sort_values(by=[("a", 1)])
    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=("a", 1))
    result = df.sort_values(by=("a", 1))
    assert_frame_equal(result, expected)
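# Minimal sketch of the replacement API: on current pandas the deprecated
# sort_index(by=...) spelling is gone, and column sorts go through
# DataFrame.sort_values directly.
import pandas as pd

df = pd.DataFrame({"a": [3, 1, 2], "b": [9, 8, 7]})
print(df.sort_values(by="a"))         # single-column sort
print(df.sort_values(by=["a", "b"]))  # multi-column sort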
def test_per_axis_per_level_doc_examples(self): # test index maker idx = pd.IndexSlice # from indexing.rst / advanced index = MultiIndex.from_product( [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)]) columns = MultiIndex.from_tuples( [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"], ) df = DataFrame( np.arange(len(index) * len(columns), dtype="int64").reshape( (len(index), len(columns))), index=index, columns=columns, ) result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] expected = df.loc[[( a, b, c, d, ) for a, b, c, d in df.index.values if ( a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")]] tm.assert_frame_equal(result, expected) result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :] tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :] expected = df.loc[[( a, b, c, d, ) for a, b, c, d in df.index.values if (c == "C1" or c == "C3")]] tm.assert_frame_equal(result, expected) result = df.loc[idx[:, :, ["C1", "C3"]], :] tm.assert_frame_equal(result, expected) # not sorted msg = ("MultiIndex slicing requires the index to be lexsorted: " r"slicing on levels \[1\], lexsort depth 1") with pytest.raises(UnsortedIndexError, match=msg): df.loc["A1", ("a", slice("foo"))] # GH 16734: not sorted, but no real slicing tm.assert_frame_equal(df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]]) df = df.sort_index(axis=1) # slicing df.loc["A1", (slice(None), "foo")] df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")] # setitem df.loc(axis=0)[:, :, ["C1", "C3"]] = -10
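# Standalone sketch of the pd.IndexSlice idiom used above: idx[...] builds
# the tuple of per-level slicers that df.loc consumes. Assumes only pandas
# and numpy; the index must be lexsorted for slice-based selection.
import numpy as np
import pandas as pd

idx = pd.IndexSlice
index = pd.MultiIndex.from_product([["A0", "A1"], ["B0", "B1"], ["C0", "C1"]])
df = pd.DataFrame(np.arange(8), index=index, columns=["val"])
# any label on the first two levels, only "C1" on the third
print(df.loc[idx[:, :, "C1"], :])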
def aggr(func):
    # `jim` and `joe` are dicts of value sequences and `mi` is a MultiIndex,
    # all taken from the enclosing scope; apply `func` to each dict entry
    # and assemble the results into one column per dict.
    f = lambda a: np.fromiter(map(func, a), dtype='f8')
    arr = np.vstack((f(jim.values()), f(joe.values()))).T
    res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
    return res.sort_index()
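# Hedged usage sketch: the enclosing-scope names are assumed to look
# roughly like this (per-key lists of values keyed by MultiIndex entries).
import numpy as np
import pandas as pd
from pandas import DataFrame

mi = pd.MultiIndex.from_tuples([("x", 1), ("x", 2), ("y", 1)])
jim = {k: [1.0, 2.0] for k in mi}  # hypothetical grouped data
joe = {k: [3.0, 4.0] for k in mi}
print(aggr(np.mean))  # one row per key, aggregated 'jim'/'joe' columns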
def test_sort_nan(self): # GH3917 nan = np.nan df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], 'B': [9, nan, 5, 2, 5, 4, 5]}) # sort one column only expected = DataFrame( {'A': [nan, 1, 1, 2, 4, 6, 8], 'B': [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5]) sorted_df = df.sort_values(['A'], na_position='first') assert_frame_equal(sorted_df, expected) expected = DataFrame( {'A': [nan, 8, 6, 4, 2, 1, 1], 'B': [5, 4, 5, 5, nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3]) sorted_df = df.sort_values(['A'], na_position='first', ascending=False) assert_frame_equal(sorted_df, expected) expected = df.reindex(columns=['B', 'A']) sorted_df = df.sort_values(by=1, axis=1, na_position='first') assert_frame_equal(sorted_df, expected) # na_position='last', order expected = DataFrame( {'A': [1, 1, 2, 4, 6, 8, nan], 'B': [2, 9, nan, 5, 5, 4, 5]}, index=[3, 0, 1, 6, 4, 5, 2]) sorted_df = df.sort_values(['A', 'B']) assert_frame_equal(sorted_df, expected) # na_position='first', order expected = DataFrame( {'A': [nan, 1, 1, 2, 4, 6, 8], 'B': [5, 2, 9, nan, 5, 5, 4]}, index=[2, 3, 0, 1, 6, 4, 5]) sorted_df = df.sort_values(['A', 'B'], na_position='first') assert_frame_equal(sorted_df, expected) # na_position='first', not order expected = DataFrame( {'A': [nan, 1, 1, 2, 4, 6, 8], 'B': [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5]) sorted_df = df.sort_values(['A', 'B'], ascending=[ 1, 0], na_position='first') assert_frame_equal(sorted_df, expected) # na_position='last', not order expected = DataFrame( {'A': [8, 6, 4, 2, 1, 1, nan], 'B': [4, 5, 5, nan, 2, 9, 5]}, index=[5, 4, 6, 1, 3, 0, 2]) sorted_df = df.sort_values(['A', 'B'], ascending=[ 0, 1], na_position='last') assert_frame_equal(sorted_df, expected) # Test DataFrame with nan label df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], 'B': [9, nan, 5, 2, 5, 4, 5]}, index=[1, 2, 3, 4, 5, 6, nan]) # NaN label, ascending=True, na_position='last' sorted_df = df.sort_index( kind='quicksort', ascending=True, na_position='last') expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], 'B': [9, nan, 5, 2, 5, 4, 5]}, index=[1, 2, 3, 4, 5, 6, nan]) assert_frame_equal(sorted_df, expected) # NaN label, ascending=True, na_position='first' sorted_df = df.sort_index(na_position='first') expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8], 'B': [5, 9, nan, 5, 2, 5, 4]}, index=[nan, 1, 2, 3, 4, 5, 6]) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='last' sorted_df = df.sort_index(kind='quicksort', ascending=False) expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4], 'B': [4, 5, 2, 5, nan, 9, 5]}, index=[6, 5, 4, 3, 2, 1, nan]) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='first' sorted_df = df.sort_index( kind='quicksort', ascending=False, na_position='first') expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1], 'B': [5, 4, 5, 2, 5, nan, 9]}, index=[nan, 6, 5, 4, 3, 2, 1]) assert_frame_equal(sorted_df, expected)
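# Quick standalone illustration of na_position with sort_values, matching
# the semantics tested above.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan, 3.0]})
print(df.sort_values("A"))                       # NaN rows land last by default
print(df.sort_values("A", na_position="first"))  # NaN rows moved to the front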
df1.columns = pd.MultiIndex.from_tuples(tuples)  # multi-index the columns with tuple values
df1.index.names = iname  # name the index
df1.dropna(inplace=True)  # drop null values
df1.head(10)

# reshape the table
df1 = df1.stack()
df1 = df1.stack()
df1 = df1.stack()
df1 = DataFrame(df1, columns=['NUMBER'])  # name the value column
df1 = df1.swaplevel(-3, -1, axis=0)  # reorder the index levels
df1.sort_index(level=-4, axis=0, ascending=True, inplace=True)  # sort the index
df1.index.rename(['Total', 'Different Residence', 'Different County'],
                 level=[-3, -2, -1], inplace=True)  # name the index levels that were left unnamed
df1.head(20)

# repeat all the steps for df2
cname5 = df2.iloc[0, :1]
df2.drop([86, 87], axis=0, inplace=True)
df2.drop([10], axis=1, inplace=True)
df2.set_index([0], inplace=True)
df2.columns = pd.MultiIndex.from_tuples(tuples)
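# Hedged sketch of the same reshape pattern on a tiny frame: stack column
# levels into the row index, then reorder and sort the levels. Names here
# are illustrative, not from the original dataset.
import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_tuples([("Total", "foo"), ("Total", "bar")])
wide = pd.DataFrame(np.arange(4).reshape(2, 2), index=["r0", "r1"], columns=cols)
tall = wide.stack().stack()           # one stack() per column level
tall = tall.to_frame(name="NUMBER")   # name the value column
tall = tall.swaplevel(0, -1, axis=0)  # move the innermost level to the front
tall.sort_index(axis=0, inplace=True)
print(tall)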