Example #1
from collections import defaultdict
from logging import getLogger
from typing import Dict, List, Tuple

import pandas as pd

logger = getLogger(__name__)

def align_partial_results(
    df: pd.DataFrame,
    progr_key: str,  # progression key
    metrics: List[str],
    interpolation: str = "slinear",
    # TODO: Allow normalizing progr_key (e.g. subtract min time stamp)
) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]:
    """Helper function to align partial results with heterogeneous index

    Args:
        df: The DataFrame containing the raw data (in long format).
        progr_key: The key of the column indexing progression (such as
            the number of training examples, timestamps, etc.).
        metrics: The names of the metrics to consider.
        interpolation: The interpolation method used to fill missing values
            (if applicable). See `pandas.DataFrame.interpolate` for
            available options.

    Returns:
        A two-tuple of dicts, each mapping the provided metric names to the
        index-normalized and interpolated means (and sems, respectively).
    """
    missing_metrics = set(metrics) - set(df["metric_name"])
    if missing_metrics:
        raise ValueError(
            f"Metrics {missing_metrics} not found in input dataframe")
    # select relevant metrics
    df = df[df["metric_name"].isin(metrics)]
    # drop arm names (assumes 1:1 map between trial indices and arm names)
    df = df.drop("arm_name", axis=1)
    # set multi-index over trial, metric, and progression key
    df = df.set_index(["trial_index", "metric_name", progr_key])
    # sort index
    df = df.sort_index(level=["trial_index", progr_key])
    # drop sem if all NaN (assumes presence of sem column)
    has_sem = not df["sem"].isnull().all()
    if not has_sem:
        df = df.drop("sem", axis=1)
    # create the common index that every map result will be re-indexed w.r.t.
    index_union = df.index.levels[2].unique()
    # loop through (trial, metric) combos and align data
    dfs_mean = defaultdict(list)
    dfs_sem = defaultdict(list)
    for tidx in df.index.levels[0]:  # this could be slow if there are many trials
        for metric in df.index.levels[1]:
            # grab trial+metric sub-df and reindex to common index
            df_ridx = df.loc[(tidx, metric)].reindex(index_union)
            # interpolate / fill missing results (only fills in between points,
            # does not extrapolate)
            # TODO: Allow passing of additional kwargs to `interpolate`
            # TODO: Allow using an arbitrary prediction model for this instead
            try:
                df_interp = df_ridx.interpolate(method=interpolation,
                                                limit_area="inside")
            except ValueError as e:
                df_interp = df_ridx
                logger.info(f"Got exception `{e}` during interpolation. "
                            "Using uninterpolated values instead.")
            # renaming column to trial index, append results
            dfs_mean[metric].append(df_interp["mean"].rename(tidx))
            if has_sem:
                dfs_sem[metric].append(df_interp["sem"].rename(tidx))

    # combine results into output dataframes
    dfs_mean = {
        metric: pd.concat(dfs, axis=1)
        for metric, dfs in dfs_mean.items()
    }
    dfs_sem = {
        metric: pd.concat(dfs, axis=1)
        for metric, dfs in dfs_sem.items()
    }

    return dfs_mean, dfs_sem
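
A minimal usage sketch on hypothetical long-format data (the "step" progression column, the values, and the all-NaN "sem" column are illustrative; the default "slinear" interpolation requires SciPy):

import numpy as np
import pandas as pd

# hypothetical results: two trials reporting "loss" at different steps
raw = pd.DataFrame({
    "trial_index": [0, 0, 1, 1],
    "arm_name": ["0_0", "0_0", "1_0", "1_0"],
    "metric_name": ["loss"] * 4,
    "step": [1, 3, 2, 4],          # heterogeneous progression values
    "mean": [1.0, 0.6, 0.9, 0.5],
    "sem": [np.nan] * 4,           # all-NaN sem column gets dropped
})

dfs_mean, dfs_sem = align_partial_results(raw, progr_key="step", metrics=["loss"])
# rows: union of steps {1, 2, 3, 4}; columns: one per trial index
print(dfs_mean["loss"])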
Example #2
    def test_per_axis_per_level_getitem(self):

        # GH6134
        # example test case
        ix = MultiIndex.from_product(
            [_mklbl("A", 5),
             _mklbl("B", 7),
             _mklbl("C", 4),
             _mklbl("D", 2)])
        df = DataFrame(np.arange(len(ix.to_numpy())), index=ix)

        result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(a, b, c, d) for a, b, c, d in df.index.values
                           if (a == "A1" or a == "A2" or a == "A3")
                           and (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)

        expected = df.loc[[(a, b, c, d) for a, b, c, d in df.index.values
                           if (a == "A1" or a == "A2" or a == "A3")
                           and (c == "C1" or c == "C2" or c == "C3")]]
        result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
        tm.assert_frame_equal(result, expected)

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3),
                                        ("B", 1)],
                                       names=["one", "two"])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )

        df = DataFrame(np.arange(16, dtype="int64").reshape(4, 4),
                       index=index,
                       columns=columns)
        df = df.sort_index(axis=0).sort_index(axis=1)

        # identity
        result = df.loc[(slice(None), slice(None)), :]
        tm.assert_frame_equal(result, df)
        result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)
        result = df.loc[:, (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)

        # index
        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), 1), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # columns
        result = df.loc[:, (slice(None), ["foo"])]
        expected = df.iloc[:, [1, 3]]
        tm.assert_frame_equal(result, expected)

        # both
        result = df.loc[(slice(None), 1), (slice(None), ["foo"])]
        expected = df.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc["A", "a"]
        expected = DataFrame(
            {
                "bar": [1, 5, 9],
                "foo": [0, 4, 8]
            },
            index=Index([1, 2, 3], name="two"),
            columns=Index(["bar", "foo"], name="lvl1"),
        )
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), [1, 2]), :]
        expected = df.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # multi-level series
        s = Series(np.arange(len(ix.to_numpy())), index=ix)
        result = s.loc["A1":"A3", :, ["C1", "C3"]]
        expected = s.loc[[(a, b, c, d) for a, b, c, d in s.index.values
                          if (a == "A1" or a == "A2" or a == "A3")
                          and (c == "C1" or c == "C3")]]
        tm.assert_series_equal(result, expected)

        # boolean indexers
        result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
        expected = df.iloc[[2, 3]]
        tm.assert_frame_equal(result, expected)

        msg = ("cannot index with a boolean indexer "
               "that is not the same length as the index")
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), np.array([True, False])), :]

        with pytest.raises(KeyError, match=r"\[1\] not in index"):
            # slice(None) is on the index, [1] is on the columns, but 1 is
            #  not in the columns, so we raise
            #  This used to treat [1] as positional GH#16396
            df.loc[slice(None), [1]]

        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # not lexsorted
        assert df.index.lexsort_depth == 2
        df = df.sort_index(level=1, axis=0)
        assert df.index.lexsort_depth == 0

        msg = ("MultiIndex slicing requires the index to be "
               r"lexsorted: slicing on levels \[1\], lexsort depth 0")
        with pytest.raises(UnsortedIndexError, match=msg):
            df.loc[(slice(None), slice("bar")), :]

        # GH 16734: not sorted, but no real slicing
        result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
        tm.assert_frame_equal(result, df.iloc[[1, 3], :])
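
The nested slice(None) tuples above can be written more readably with pd.IndexSlice; a small sketch on the same 4x4 frame the test builds:

import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3), ("B", 1)],
                                  names=["one", "two"])
columns = pd.MultiIndex.from_tuples(
    [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
    names=["lvl0", "lvl1"])
df = pd.DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns)
df = df.sort_index(axis=0).sort_index(axis=1)

idx = pd.IndexSlice
# equivalent to df.loc[(slice(None), 1), (slice(None), ["foo"])]
print(df.loc[idx[:, 1], idx[:, "foo"]])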
Example #3
    def test_per_axis_per_level_setitem(self):

        # test index maker
        idx = pd.IndexSlice

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3),
                                        ("B", 1)],
                                       names=["one", "two"])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )

        df_orig = DataFrame(np.arange(16, dtype="int64").reshape(4, 4),
                            index=index,
                            columns=columns)
        df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

        # identity
        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[:, (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        # index
        df = df_orig.copy()
        df.loc[(slice(None), [1]), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, 1] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # columns
        df = df_orig.copy()
        df.loc[:, (slice(None), ["foo"])] = 100
        expected = df_orig.copy()
        expected.iloc[:, [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # both
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[idx[:, 1], idx[:, ["foo"]]] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc["A", "a"] = 100
        expected = df_orig.copy()
        expected.iloc[0:3, 0:2] = 100
        tm.assert_frame_equal(df, expected)

        # setting with a list-like
        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] = np.array([[100, 100], [100, 100]],
                                                  dtype="int64")
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # not enough values
        df = df_orig.copy()

        msg = "setting an array element with a sequence."
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), 1),
                   (slice(None), ["foo"])] = np.array([[100], [100, 100]],
                                                      dtype="int64")

        msg = "Must have equal len keys and value when setting with an iterable"
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), 1),
                   (slice(None), ["foo"])] = np.array([100, 100, 100, 100],
                                                      dtype="int64")

        # with an alignable rhs
        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] = (df.loc[(slice(None), 1),
                                                 (slice(None), ["foo"])] * 5)
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] *= df.loc[(slice(None), 1),
                                                 (slice(None), ["foo"])]
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)

        rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy()
        rhs.loc[:, ("c", "bah")] = 10
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)
Example #4
    def test_sort_index_and_reconstruction(self):

        # GH#15622
        # lexsortedness should be identical
        # across MultiIndex construction methods

        df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
        expected = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_tuples(
                [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
            ),
        )
        assert expected.index.is_lexsorted()

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex(
                levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
            ),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()

        tm.assert_frame_equal(result, expected)

        concatted = pd.concat([df, df], keys=[0.8, 0.5])
        result = concatted.sort_index()

        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # GH#14015
        df = DataFrame(
            [[1, 2], [6, 7]],
            columns=MultiIndex.from_tuples(
                [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
                names=["l1", "Date"],
            ),
        )

        df.columns = df.columns.set_levels(
            pd.to_datetime(df.columns.levels[1]), level=1
        )
        assert not df.columns.is_lexsorted()
        assert not df.columns.is_monotonic
        result = df.sort_index(axis=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic
        result = df.sort_index(axis=1, level=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic
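
A minimal sketch of the property these assertions check, using is_monotonic_increasing (newer pandas versions deprecate is_lexsorted and the bare is_monotonic):

import pandas as pd

mi = pd.MultiIndex.from_tuples([(0.8, "a"), (0.5, "b"), (0.8, "b"), (0.5, "a")])
df = pd.DataFrame({"x": [1, 2, 3, 4]}, index=mi)

print(df.index.is_monotonic_increasing)               # False: tuples out of order
print(df.sort_index().index.is_monotonic_increasing)  # True after sorting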
Example #5
    def __init__(self,
                 data: pd.DataFrame,
                 strategy: Type[Strategy],
                 *,
                 cash: float = 10000,
                 commission: float = .0,
                 margin: float = 1.,
                 trade_on_close=False
                 ):
        """
        Initialize a backtest. Requires data and a strategy to test.

        `data` is a `pd.DataFrame` with columns:
        `Open`, `High`, `Low`, `Close`, and (optionally) `Volume`.
        If any columns are missing, set them to what you have available,
        e.g.

            df['Open'] = df['High'] = df['Low'] = df['Close']

        DataFrame index can be either datetime index (timestamps)
        or a monotonic range index (i.e. a sequence of periods).

        `strategy` is a `backtesting.backtesting.Strategy`
        _subclass_ (not an instance).

        `cash` is the initial cash to start with.

        `commission` is the commission ratio. E.g. if your broker's commission
        is 1% of trade value, set commission to `0.01`. Note, if you wish to
        account for bid-ask spread, you can approximate doing so by increasing
        the commission, e.g. set it to `0.0002` for commission-less forex
        trading where the average spread is roughly 0.2‰ of asking price.

        `margin` is the required margin (ratio) of a leveraged account.
        No difference is made between initial and maintenance margins.
        To run the backtest using e.g. 50:1 leverage that your broker allows,
        set margin to `0.02` (1 / leverage).

        If `trade_on_close` is `True`, market orders will be executed
        with respect to the current bar's closing price instead of the
        next bar's open.
        """

        if not (isinstance(strategy, type) and issubclass(strategy, Strategy)):
            raise TypeError('`strategy` must be a Strategy sub-type')
        if not isinstance(data, pd.DataFrame):
            raise TypeError("`data` must be a pandas.DataFrame with OHLC columns")
        if not isinstance(commission, Number):
            raise TypeError('`commission` must be a float value, percent of '
                            'entry order price')

        data = data.copy(deep=False)

        # Convert index to datetime index
        if (not data.index.is_all_dates and
            not isinstance(data.index, pd.RangeIndex) and
            # Numeric index consisting mostly of large numbers
            (data.index.is_numeric() and
             (data.index > pd.Timestamp('1975').timestamp()).mean() > .8)):
            try:
                data.index = pd.to_datetime(data.index, infer_datetime_format=True)
            except ValueError:
                pass

        if 'Volume' not in data:
            data['Volume'] = np.nan

        if len(data) == 0:
            raise ValueError('OHLC `data` is empty')
        if len(data.columns & {'Open', 'High', 'Low', 'Close', 'Volume'}) != 5:
            raise ValueError("`data` must be a pandas.DataFrame with columns "
                             "'Open', 'High', 'Low', 'Close', and (optionally) 'Volume'")
        if data[['Open', 'High', 'Low', 'Close']].isnull().values.any():
            raise ValueError('Some OHLC values are missing (NaN). '
                             'Please strip those lines with `df.dropna()` or '
                             'fill them in with `df.interpolate()` or whatever.')
        if not data.index.is_monotonic_increasing:
            warnings.warn('Data index is not sorted in ascending order. Sorting.',
                          stacklevel=2)
            data = data.sort_index()
        if not data.index.is_all_dates:
            warnings.warn('Data index is not datetime. Assuming simple periods.',
                          stacklevel=2)

        self._data = data   # type: pd.DataFrame
        self._broker = partial(
            _Broker, cash=cash, commission=commission, margin=margin,
            trade_on_close=trade_on_close, length=len(data)
        )
        self._strategy = strategy
        self._results = None
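
A construction sketch under stated assumptions: GOOG is the sample OHLC frame bundled with backtesting.py, while Hold is a hypothetical minimal strategy written here only to make the call runnable:

from backtesting import Backtest, Strategy
from backtesting.test import GOOG  # sample OHLC data shipped with the package


class Hold(Strategy):
    # hypothetical strategy: buy once on the first bar, then hold
    def init(self):
        pass

    def next(self):
        if not self.position:
            self.buy()


bt = Backtest(GOOG, Hold, cash=10000, commission=.002, trade_on_close=False)
stats = bt.run()
print(stats)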
Example #6
def hampel_filter_with_dev_df(df: pd.DataFrame,
                              vals_col: str,
                              time_col=None,
                              win_size=30,
                              num_dev=3,
                              center_win=True) -> pd.DataFrame:
    """
    This function takes in dataframe containing time series of values, applies Hampel filter on
    these values, and returns dataframe consisting of original values columns along with
    the Hampel filtered data, outlier values, boolean flags where outliers found, values for lower
    deviation from median, values for upper deviation from median.

    Parameters
    ----------
    df: pd.DataFrame
        data from containing time series that needs to be Hampel filtered
    vals_col: str
        Single column name that contains values that need to be filtered.
    time_col: str
        Name of column that contains dates or timestamps
    win_size: int
        Size of sliding window for filtering.  Essentially the number of time steps to be considered when filtering.
    num_dev: int
        Number of standard deviations to consider when detecting values that would be considered outliers.
    center_win: Boolean
        Boolean value that determines whether the window is centered about the point being filtered?  Default=True.
        If False, point is at the leading edge (i.e. right side) of window  calculation.

    Returns
    -------
    Function returns a full dataframe consisting of original values columns along with
    the Hampel filtered data, outlier values and boolean flags where outliers found.
    """

    if time_col is not None:
        if time_col not in df.columns:
            raise Exception(
                "Timestamp column '{}' is missing!".format(time_col))
        if not np.issubdtype(df[time_col].dtype, np.datetime64):
            # pd.to_datetime raises if the column cannot be parsed as timestamps
            df[time_col] = pd.to_datetime(df[time_col])
        drop_cols = set(df.columns) - set([time_col, vals_col])
        # Not really filtered at this point. Just naming appropriately ahead of time.
        orig_vals = df.sort_values(
            time_col, ascending=True).set_index(time_col).copy()
        filtered = orig_vals.drop(columns=drop_cols).copy()
    else:
        if not isinstance(df.index, pd.DatetimeIndex):
            raise Exception("DataFrame index is not pd.DatetimeIndex")
        df = df.sort_index()
        drop_cols = set(df.columns) - set([vals_col])
        orig_vals = df.copy()
        filtered = orig_vals.drop(columns=drop_cols).copy()

    # Scale factor for estimating standard deviation based upon median value
    L = 1.4826

    # Calculate rolling median for the series
    rolling_median = filtered.rolling(window=int(win_size),
                                      center=center_win,
                                      min_periods=1).median()

    # Define a lambda function to apply to the series to calculate Median Absolute Deviation
    MAD = lambda x: np.median(np.abs(x - np.median(x)))

    # Calculate rolling MAD series
    rolling_MAD = filtered.rolling(window=int(win_size),
                                   center=center_win,
                                   min_periods=1).apply(MAD)

    # Calculate threshold level for filtering based upon the number of standard deviation and
    # constant scaling factor L.
    threshold = int(num_dev) * L * rolling_MAD

    # Difference between original values and rolling median
    # Again, "filtered" not yet filtered at this point.
    difference = np.abs(filtered - rolling_median)

    median_minus_threshold = rolling_median - threshold
    median_minus_threshold.rename(columns={vals_col: 'LOWER_DEV'},
                                  inplace=True)
    median_plus_threshold = rolling_median + threshold
    median_plus_threshold.rename(columns={vals_col: 'UPPER_DEV'}, inplace=True)
    # TODO: Look at logic here to possibly not mark as an outlier if the
    # threshold value is 0.0

    # Flag outliers
    outlier_idx = difference > threshold

    # Now it's filtered.  This should replace original values with filtered values from the rolling_median
    # dataframe where outliers were found.
    filtered[outlier_idx] = rolling_median[outlier_idx]
    filtered.rename(columns={vals_col: 'FLTRD_VAL'}, inplace=True)

    # Capture outliers column
    outliers = orig_vals[outlier_idx].rename(columns={
        vals_col: 'OUTLIER_VAL'
    }).drop(columns=drop_cols)
    # Capture outlier IS_OUTLIER column
    outlier_idx.rename(columns={vals_col: 'IS_OUTLIER'}, inplace=True)

    # The following returns a full dataframe consisting of original values columns
    # along with the Hampel filtered data, outlier values and boolean flags where outliers found.
    return pd.concat([
        orig_vals, filtered, outliers, outlier_idx, median_minus_threshold,
        median_plus_threshold
    ],
                     axis=1)
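
A call sketch on synthetic data (the column names "ts" and "value" are illustrative):

import numpy as np
import pandas as pd

rng = pd.date_range("2021-01-01", periods=100, freq="H")
vals = np.sin(np.linspace(0, 8, 100))
vals[50] = 10.0  # inject an obvious outlier
ts = pd.DataFrame({"ts": rng, "value": vals})

out = hampel_filter_with_dev_df(ts, vals_col="value", time_col="ts",
                                win_size=10, num_dev=3)
# rows flagged as outliers, with the original and the filtered value
print(out.loc[out["IS_OUTLIER"], ["OUTLIER_VAL", "FLTRD_VAL"]])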
Example #7
from pandas import Series

obj = Series([2, 3, 5, 6], index=['d', 'c', 'b', 'a'])

# How to sort a Series by its index
obj.sort_index()
obj.sort_index(ascending=False)

# How to sort a Series by its values
obj.sort_values()
obj.sort_values(ascending=False)


import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(8).reshape(2, 4), index=['two', 'one'],
               columns=['b', 'd', 'a', 'c'])

df.sort_index()
df.sort_index(ascending=False)

df.sort_index(axis=1)
df.sort_index(ascending=False, axis=1)

df.sort_values(by='b', axis=0, ascending=False)
df.sort_values(by='one', axis=1, ascending=False)

obj1 = Series([78,88,92,79,67,91,70,86,90,90])
obj1.sort_values()

obj1.rank()
obj1.rank(ascending=False)  # ties are averaged, producing .5 ranks
obj1.rank(ascending=False, method='average')  # 'average' keeps the .5 ranks for ties
obj1.rank(ascending=False, method='min')  # 'min' gives tied values the lower rank
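
A short sketch of how the tie-breaking methods differ (a three-value series with one tie, assumed here for illustration):

from pandas import Series

s = Series([90, 90, 86])
print(s.rank(ascending=False, method='average'))  # ties share 1.5, 1.5; then 3.0
print(s.rank(ascending=False, method='min'))      # ties both get 1.0; then 3.0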
Example #8
    def test_getitem_duplicates_multiindex(self):
        # GH 5725: 'A' happens to be a valid Timestamp, so this doesn't raise
        # the appropriate error, only in PY3 of course!

        index = MultiIndex(levels=[['D', 'B', 'C'],
                                   [0, 26, 27, 37, 57, 67, 75, 82]],
                           labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                                   [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                           names=['tag', 'day'])
        arr = np.random.randn(len(index), 1)
        df = DataFrame(arr, index=index, columns=['val'])
        result = df.val['D']
        expected = Series(arr.ravel()[0:3], name='val', index=Index(
            [26, 37, 57], name='day'))
        tm.assert_series_equal(result, expected)

        def f():
            df.val['A']

        pytest.raises(KeyError, f)

        def f():
            df.val['X']

        pytest.raises(KeyError, f)

        # A is treated as a special Timestamp
        index = MultiIndex(levels=[['A', 'B', 'C'],
                                   [0, 26, 27, 37, 57, 67, 75, 82]],
                           labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                                   [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                           names=['tag', 'day'])
        df = DataFrame(arr, index=index, columns=['val'])
        result = df.val['A']
        expected = Series(arr.ravel()[0:3], name='val', index=Index(
            [26, 37, 57], name='day'))
        tm.assert_series_equal(result, expected)

        def f():
            df.val['X']

        pytest.raises(KeyError, f)

        # GH 7866
        # multi-index slicing with missing indexers
        idx = MultiIndex.from_product([['A', 'B', 'C'],
                                       ['foo', 'bar', 'baz']],
                                      names=['one', 'two'])
        s = Series(np.arange(9, dtype='int64'), index=idx).sort_index()

        exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
                                          names=['one', 'two'])
        expected = Series(np.arange(3, dtype='int64'),
                          index=exp_idx).sort_index()

        result = s.loc[['A']]
        tm.assert_series_equal(result, expected)
        result = s.loc[['A', 'D']]
        tm.assert_series_equal(result, expected)

        # not any values found
        pytest.raises(KeyError, lambda: s.loc[['D']])

        # empty ok
        result = s.loc[[]]
        expected = s.iloc[[]]
        tm.assert_series_equal(result, expected)

        idx = pd.IndexSlice
        expected = Series([0, 3, 6], index=MultiIndex.from_product(
            [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index()

        result = s.loc[idx[:, ['foo']]]
        tm.assert_series_equal(result, expected)
        result = s.loc[idx[:, ['foo', 'bah']]]
        tm.assert_series_equal(result, expected)

        # GH 8737
        # empty indexer
        multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'],
                                               ['alpha', 'beta']))
        df = DataFrame(
            np.random.randn(5, 6), index=range(5), columns=multi_index)
        df = df.sort_index(level=0, axis=1)

        expected = DataFrame(index=range(5),
                             columns=multi_index.reindex([])[0])
        result1 = df.loc[:, ([], slice(None))]
        result2 = df.loc[:, (['foo'], [])]
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)

        # regression from < 0.14.0
        # GH 7914
        df = DataFrame([[np.mean, np.median], ['mean', 'median']],
                       columns=MultiIndex.from_tuples([('functs', 'mean'),
                                                       ('functs', 'median')]),
                       index=['function', 'name'])
        result = df.loc['function', ('functs', 'mean')]
        assert result == np.mean
Example #9
    def test_per_axis_per_level_getitem(self):

        # GH6134
        # example test case
        ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl(
            'C', 4), _mklbl('D', 2)])
        df = DataFrame(np.arange(len(ix.get_values())), index=ix)

        result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
        expected = df.loc[[tuple([a, b, c, d])
                           for a, b, c, d in df.index.values
                           if (a == 'A1' or a == 'A2' or a == 'A3') and (
                               c == 'C1' or c == 'C3')]]
        tm.assert_frame_equal(result, expected)

        expected = df.loc[[tuple([a, b, c, d])
                           for a, b, c, d in df.index.values
                           if (a == 'A1' or a == 'A2' or a == 'A3') and (
                               c == 'C1' or c == 'C2' or c == 'C3')]]
        result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :]
        tm.assert_frame_equal(result, expected)

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([('A', 1), ('A', 2),
                                        ('A', 3), ('B', 1)],
                                       names=['one', 'two'])
        columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                          ('b', 'foo'), ('b', 'bah')],
                                         names=['lvl0', 'lvl1'])

        df = DataFrame(
            np.arange(16, dtype='int64').reshape(
                4, 4), index=index, columns=columns)
        df = df.sort_index(axis=0).sort_index(axis=1)

        # identity
        result = df.loc[(slice(None), slice(None)), :]
        tm.assert_frame_equal(result, df)
        result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)
        result = df.loc[:, (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)

        # index
        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), 1), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # columns
        result = df.loc[:, (slice(None), ['foo'])]
        expected = df.iloc[:, [1, 3]]
        tm.assert_frame_equal(result, expected)

        # both
        result = df.loc[(slice(None), 1), (slice(None), ['foo'])]
        expected = df.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc['A', 'a']
        expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]),
                             index=Index([1, 2, 3], name='two'),
                             columns=Index(['bar', 'foo'], name='lvl1'))
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), [1, 2]), :]
        expected = df.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # multi-level series
        s = Series(np.arange(len(ix.get_values())), index=ix)
        result = s.loc['A1':'A3', :, ['C1', 'C3']]
        expected = s.loc[[tuple([a, b, c, d])
                          for a, b, c, d in s.index.values
                          if (a == 'A1' or a == 'A2' or a == 'A3') and (
                              c == 'C1' or c == 'C3')]]
        tm.assert_series_equal(result, expected)

        # boolean indexers
        result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
        expected = df.iloc[[2, 3]]
        tm.assert_frame_equal(result, expected)

        def f():
            df.loc[(slice(None), np.array([True, False])), :]

        pytest.raises(ValueError, f)

        # ambiguous cases
        # these can be multiply interpreted (e.g. in this case
        # as df.loc[slice(None), [1]] as well)
        pytest.raises(KeyError, lambda: df.loc[slice(None), [1]])

        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # not lexsorted
        assert df.index.lexsort_depth == 2
        df = df.sort_index(level=1, axis=0)
        assert df.index.lexsort_depth == 0
        with tm.assert_raises_regex(
                UnsortedIndexError,
                'MultiIndex slicing requires the index to be '
                r'lexsorted: slicing on levels \[1\], lexsort depth 0'):
            df.loc[(slice(None), slice('bar')), :]

        # GH 16734: not sorted, but no real slicing
        result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
        tm.assert_frame_equal(result, df.iloc[[1, 3], :])
Example #10
format = lambda x: '%.2f' % x

print(df['e'].map(format))

s1 = df['e'].map(format)
print(s1.sort_index())

df2 = DataFrame(
    np.arange(8).reshape(2, 4),
    columns=['d', 'a', 'b', 'c'],
    index=['three', 'one'],
)
print(df2)

print(df2.sort_index())  # sort by row index
print(df2.sort_index(axis=1))  # sort by column labels

# Data (index and values) are sorted in ascending order by default.
# To sort in descending order, pass ascending=False.
print(df.sort_index(axis=1, ascending=False))

# To sort an object by its values, use the sort_values method.
obj = Series([4, 7, -3, 1])
print(obj)
print(obj.sort_values())

obj2 = Series([4, np.nan, 8, np.nan, -10, 2])
print(obj2)
print(obj2.sort_values())  # NaN values are placed last when sorting
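
If the NaNs are needed at the front instead, sort_values accepts na_position; a small sketch:

import numpy as np
from pandas import Series

s = Series([4, np.nan, -10])
print(s.sort_values())                     # NaN placed last by default
print(s.sort_values(na_position='first'))  # NaN forced to the front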
Example #11
    def test_per_axis_per_level_setitem(self):

        # test index maker
        idx = pd.IndexSlice

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([('A', 1), ('A', 2),
                                        ('A', 3), ('B', 1)],
                                       names=['one', 'two'])
        columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                          ('b', 'foo'), ('b', 'bah')],
                                         names=['lvl0', 'lvl1'])

        df_orig = DataFrame(
            np.arange(16, dtype='int64').reshape(
                4, 4), index=index, columns=columns)
        df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

        # identity
        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[:, (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        # index
        df = df_orig.copy()
        df.loc[(slice(None), [1]), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, 1] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # columns
        df = df_orig.copy()
        df.loc[:, (slice(None), ['foo'])] = 100
        expected = df_orig.copy()
        expected.iloc[:, [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # both
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[idx[:, 1], idx[:, ['foo']]] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc['A', 'a'] = 100
        expected = df_orig.copy()
        expected.iloc[0:3, 0:2] = 100
        tm.assert_frame_equal(df, expected)

        # setting with a list-like
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
            [[100, 100], [100, 100]], dtype='int64')
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # not enough values
        df = df_orig.copy()

        def f():
            df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
                [[100], [100, 100]], dtype='int64')

        pytest.raises(ValueError, f)

        def f():
            df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
                [100, 100, 100, 100], dtype='int64')

        pytest.raises(ValueError, f)

        # with an alignable rhs
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice(
            None), 1), (slice(None), ['foo'])] * 5
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice(
            None), 1), (slice(None), ['foo'])]
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)

        rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy()
        rhs.loc[:, ('c', 'bah')] = 10
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)
Example #12
    def test_changes_length_raises(self):
        df = DataFrame({"A": [1, 2, 3]})
        with pytest.raises(ValueError, match="change the shape"):
            df.sort_index(key=lambda x: x[:1])
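
For contrast, a sketch of a shape-preserving key: an element-wise transform keeps the index length, so it is allowed:

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]}, index=["b", "C", "a"])
# case-insensitive sort; the callable returns an index of the same length
print(df.sort_index(key=lambda idx: idx.str.lower()))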
Example #13
class predict(object):
    
    def __init__(self):
        self.__pwd = self.__getExePath()
        self.__predicted_path = self.__pwd +'output' + os.path.sep
        self.__predicted_picture_ok_path = self.__pwd +'output' + os.path.sep + 'picture' + os.path.sep + 'ok' + os.path.sep
        self.__predicted_picture_notok_path = self.__pwd +'output' + os.path.sep + 'picture' + os.path.sep + 'notok' + os.path.sep
        self.__data_basic_path = self.__pwd +'basic_data' + os.path.sep 
        self.__my_pred_file_workday = self.__predicted_path + 'predicted_last_workday.csv'
        self.__my_pred_file_holiday = self.__predicted_path + 'predicted_last_holiday.csv'
        self.zhibiao_need = DataFrame()
        

    
    def __getExePath(self):
        sap = '/'
        if sys.argv[0].find(sap) == -1:
            sap = '\\'
        indx = sys.argv[0].rfind(sap)
        path = sys.argv[0][:indx] + sap
        return path

    

    def __str2time_1(self, list_in):
        date_list_new = list()
        for time_id in list_in:
            date_new = datetime.datetime.strptime(time_id,'%Y/%m/%d/%H')
            date_new = date_new.strftime('%Y-%m-%d %H:%M:%S.000')
            date_list_new.append(date_new)
        return date_list_new

    def __date_list_1(self, zhibiao):
        d_year = zhibiao['year']
        d_month = zhibiao['month']
        d_day = zhibiao['day']
        d_hour = zhibiao['hour']
        date_len = len(d_year)
        date_haha = range(date_len)
        date_list1 = list(map(lambda i :str(d_year[i])+ '/' + str(d_month[i]) + '/'
                              + str(d_day[i])+ '/' + str(d_hour[i]) , date_haha))
        date_list_out = self.__str2time_1(date_list1)
        return date_list_out

    def __str2time(self, list_in):
        date_list_new = list()
        for time_id in list_in:
            date_new = datetime.datetime.strptime(time_id,'%Y/%m/%d')
            date_new = date_new.strftime('%Y-%m-%d %H:%M:%S.000')
            date_list_new.append(date_new)
        return date_list_new

    def __date_list(self, zhibiao):
        d_year = zhibiao['year']
        d_month = zhibiao['month']
        d_day = zhibiao['day']
        date_len = len(d_year)
        date_haha = range(date_len)
        date_list1 = list(map(lambda i :str(d_year[i])+ '/' + str(d_month[i]) + '/'
                              + str(d_day[i]) , date_haha))
        date_list_out = self.__str2time(date_list1)
        return date_list_out
  
    def __isworkday(self, data, workday_list):
        # check whether the first date in `data` appears in the workday calendar
        date_thisday = data[0]
        return date_thisday in workday_list
        
    def __ratio_mark(self, data_input, standard):
        data_input_list = DataFrame(data_input, columns=['flow'])
        mark_1 = abs(data_input_list.where(data_input_list['flow']<standard)/standard)
        mark_out = mark_1.fillna(1)
        return mark_out
    

    def __del_file(self, path):
        ls = os.listdir(path)
        for i in ls:
            c_path = os.path.join(path, i)
            if os.path.isdir(c_path):
                self.__del_file(c_path)
            else:
                os.remove(c_path)

    def __not_ok(self, data_input, lastday_input, data_input_ratio, net_item, flow_standard, alert_standard):      
        ratio_need = data_input_ratio[['UE', 'erab', 'handover', 'rrc']]
        ratio_need_columns = ratio_need.columns
        ratio_out = DataFrame()
        for columns_need in ratio_need_columns:
            ratio_item = ratio_need[columns_need]
            ratio_isok = list(map(lambda y:0  if y>alert_standard else 1, ratio_item))
            ratio_out[columns_need] = ratio_isok
                
        flow_ratio = DataFrame(data_input_ratio['flow'],columns=['flow'])
        flow_ratio.index = range(len(flow_ratio))
        flow_data_pred = data_input['flow']    
        flow_data_pred.index = range(len(flow_data_pred))  
        last_flow = lastday_input['flow']    
        flow_mark = self.__ratio_mark(flow_data_pred, flow_standard)  # above 3 GB the ratio conversion is not applied
        flow_ratio_out = DataFrame(np.array(flow_ratio)*np.array(flow_mark), columns=['flow'])
        flow_ratio_out_series = flow_ratio_out['flow']    
        flow_isok = list(map(lambda y:0  if y>alert_standard else 1, flow_ratio_out_series))
        ratio_out['flow'] = flow_isok
    
        notok_hours = 24 - ratio_out.sum(axis=0)
        if notok_hours['flow'] > 3:
            fig = plt.figure(figsize=(6, 3))
            ax = fig.add_subplot(111)
            ax.plot(range(24), flow_data_pred, label="flow_pred", color="g")
            ax.plot(range(24), last_flow, label="flow_lastday", color="r")
            ax.set_ylabel('GB')
            ax.set_xlabel('Hour')
            plt.legend(loc="upper left")
            plt.title('net_num: ' + net_item + '  flow lastday')
            plt.savefig(self.__predicted_picture_notok_path + net_item +'.png')    
        else:        
            fig = plt.figure(figsize=(6, 3))
            ax = fig.add_subplot(111)
            ax.plot(range(24), flow_data_pred, label="flow_pred", color="g")
            ax.plot(range(24), last_flow, label="flow_lastday", color="r")
            ax.set_ylabel('GB')
            ax.set_xlabel('Hour')
            plt.legend(loc="upper left")
            plt.title('net_num: ' + net_item + '  flow lastday')
            plt.savefig(self.__predicted_picture_ok_path + net_item +'.png')      
        return ratio_out
    
    @jit
    def tensor_flow_go(self, net_list):
        predicted_last = pd.DataFrame(columns=['UE', 'erab', 'flow', 'handover', 
                                               'rrc', 'net_num'])
        for cell in net_list: 
            y_zhunbei = self.zhibiao_need[self.zhibiao_need.net_num == cell][['time',
                                    'UE', 'erab', 'flow', 'handover', 'rrc']]
            y = np.array(y_zhunbei[['UE', 'erab', 'flow', 'handover', 'rrc']])
            x = np.array(range(len(y)))
      
            data = {tf.contrib.timeseries.TrainEvalFeatures.TIMES: x,
                    tf.contrib.timeseries.TrainEvalFeatures.VALUES: y,}
            reader = NumpyReader(data)
      
            train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(reader, batch_size=10, 
                                                                       window_size=48)
            ar = tf.contrib.timeseries.ARRegressor(
                    periodicities=48, input_window_size=24, output_window_size=24,
                    num_features=5, loss=tf.contrib.timeseries.ARModel.NORMAL_LIKELIHOOD_LOSS)
    
            ar.train(input_fn=train_input_fn, steps=700)
    
            evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
            # keys of evaluation: ['covariance', 'loss', 'mean', 'observed', 'start_tuple', 'times', 'global_step']
            evaluation = ar.evaluate(input_fn=evaluation_input_fn, steps=1)
            (predictions,) = tuple(ar.predict(
                    input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
                            evaluation, steps=24)))
            prediction = predictions['mean']
      
            predicted_out = DataFrame(prediction)
            predicted_out.columns = ['UE', 'erab', 'flow', 'handover', 'rrc']      
            predicted_out['net_num'] = cell
            predicted_last = predicted_last.append(predicted_out)

        return predicted_last

    def predicter_tensorflow(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        conn = pymysql.connect("localhost", "root", "1234", "python_test", charset='utf8' )
        cursor = conn.cursor()
        cursor.execute("select * from python_test.2018workday")
        canlender_2018 = cursor.fetchall()
        workday_calender = DataFrame(list(canlender_2018), columns=['date'])
        cursor.close()
        conn.close()
        workday_2018 = self.__str2time(list(workday_calender['date']))   
        #zhibiao_oneday = pd.read_csv(self.__pwd + 'today.csv')
        zhibiao_oneday = pd.read_csv('F:/work/pyqt/predicter/today.csv')
        
        data_1 = self.__date_list(zhibiao_oneday)
        is_workday = self.__isworkday(data_1, workday_2018)
        print(is_workday)
               
        conn = pymysql.connect("localhost", "root", "1234", "python_test", charset='utf8' )
        cursor = conn.cursor()
        engine = create_engine('mysql+pymysql://root:[email protected]:3306/python_test?charset=utf8')
        if is_workday:
            # read the workday table from the database
            cursor.execute("select * from python_test.workday")
            work_day = cursor.fetchall()
            zhibiao = DataFrame(list(work_day), columns=['net_num', 'year', 
                                'month', 'day','hour', 'UE', 'erab', 'flow',
                                'handover', 'rrc'])
            cursor.execute("drop table python_test.workday")
            zhibiao1 = zhibiao.append(zhibiao_oneday)
            zhibiao2 = zhibiao1.drop_duplicates(['net_num', 'year', 
                                'month', 'day','hour']) 
            zhibiao2.to_sql('workday',con=engine, schema='python_test', index=False,
                            index_label=False, if_exists='append', chunksize=1000)
            conn.commit()

        else:
            # read the holiday table from the database
            cursor.execute("select * from python_test.holiday")
            holi_day = cursor.fetchall()
            zhibiao = DataFrame(list(holi_day), columns=['net_num', 'year', 
                                'month', 'day','hour', 'UE', 'erab', 'flow',
                                'handover', 'rrc'])
            cursor.execute("drop table python_test.holiday")
            zhibiao1 = zhibiao.append(zhibiao_oneday)
            zhibiao2 = zhibiao1.drop_duplicates(['net_num', 'year', 
                                'month', 'day','hour']) 
            zhibiao2.to_sql('holiday',con=engine, schema='python_test', index=False,
                            index_label=False, if_exists='append', chunksize=1000)
            conn.commit()
        cursor.close()
        conn.close()
    
    
        date_index = self.__date_list_1(zhibiao)
        zhibiao.index = date_index 
        zhibiao['time'] = date_index   

        #zhibiao1.to_csv("F:/work/tianhe4location/test/wori.csv")   
  
        date_index_list = sorted(list(set(list(date_index))), reverse=True)
        date_len = len(date_index_list)/24  
        if date_len <= 30:
            zhibiao_need_index = date_index_list
        else:
            zhibiao_need_index = date_index_list[:60*24]
          
        self.zhibiao_need = zhibiao.loc[zhibiao_need_index]
        self.zhibiao_need = self.zhibiao_need.sort_index()
    
        net_name = set(list(self.zhibiao_need.net_num))
        net_name_all = list(net_name)
        #print(net_name_all)
        net_num_perg = round(len(net_name)/8)
        
        net_list_index = list()
        for i in range(8):
            if i == 7:
                net_list_i = net_name_all[i*net_num_perg:]
            else:
                net_list_i = net_name_all[i*net_num_perg:(i+1)*net_num_perg]
            net_list_index.append(net_list_i)
        
        predicted_all_last = pd.DataFrame(columns=['UE', 'erab', 'flow', 'handover', 
                                               'rrc', 'net_num'])
                
        #print(net_list_index)
        
        start = time.time()
        
        for net_list_index_processing in net_list_index:   
            self.tensor_flow_go(net_list_index_processing)
            
        end = time.time()
        
        print(end-start)
        #xx = net_list_index[0]
        #self.tensor_flow_go(xx)
Example #14
print(fund_trade)
# Take three columns from fund_trade: trade date, fund code, and close price.
F_CLOSE = DataFrame(fund_trade[['F_TRADE_DATE', 'F_FUND_CODE', 'F_CLOSE']])
FC = F_CLOSE

# Boolean indexing: select all closing prices whose fund code is one of the
# specified codes (comparing the column to a tuple does not work; use isin).
codelist = (165516, 161714)
FC[FC['F_FUND_CODE'].isin(codelist)]
F_CLOSE_CHOOSEN = DataFrame(FC[FC['F_FUND_CODE'].isin(codelist)])
FCC = F_CLOSE_CHOOSEN
FC
### 20-23: how to get the historical closing prices of the two funds?
print(FC)
for code in codelist:
     print(code)
F_CLOSE_CHOOSEN = {}
for code in codelist:
    F_CLOSE_CHOOSEN = DataFrame(F_CLOSE[F_CLOSE['F_FUND_CODE'] == code])
A = F_CLOSE_CHOOSEN.sort_values(by='F_TRADE_DATE')
A
F_CLOSE_CHOOSEN
# Sort by the trade-date column
F_CLOSE_CHOOSEN.sort_values(by='F_TRADE_DATE')
F_CLOSE[F_CLOSE['F_FUND_CODE'] == 165516]
F_CLOSE.loc[510220]
CODE_LIST = (510220, 165516)
for code in CODE_LIST:
    print(code)
F_CLOSE_CHOOSE = DataFrame(F_CLOSE[F_CLOSE['F_FUND_CODE'].isin(CODE_LIST)])
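
For a self-contained illustration of the isin selection above, on hypothetical data:

import pandas as pd

F_CLOSE = pd.DataFrame({
    'F_TRADE_DATE': ['2020-01-02', '2020-01-02', '2020-01-03'],
    'F_FUND_CODE': [165516, 161714, 510220],
    'F_CLOSE': [1.01, 2.02, 1.03],
})
codelist = (165516, 161714)

chosen = F_CLOSE[F_CLOSE['F_FUND_CODE'].isin(codelist)]
print(chosen.sort_values(by='F_TRADE_DATE'))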
Example #15
#Create new row
empDf.append(Series([5, False, 'Derek', 2],
                    index=['id', 'isManager', 'name', 'deptId']),
             ignore_index=True)

#Delete a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

#Delete a row
empDf.drop(1)

#Sort a Data Frame
empDf.sort_index(axis=1)
empDf.sort_values(['isManager', 'name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a DataFrame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.items():
        #if  pd.isnull(col) :
        print(pd.isnull(col), rowNum, colName)

#----------------------------------------------------------------------------
#                   Data Operations
#----------------------------------------------------------------------------
Example #16
    def test_timegrouper_with_reg_groups(self):

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

            # passing the level
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
            )
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 10, 31, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 11, 30, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)

        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        for freq in ['D', 'M', 'A', 'Q-APR']:
            expected = df.groupby('user_id')[
                'whole_cost'].resample(
                    freq).sum().dropna().reorder_levels(
                        ['date', 'user_id']).sort_index().astype('int64')
            expected.name = 'whole_cost'

            result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                               'user_id'])['whole_cost'].sum()
            assert_series_equal(result1, expected)

            result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
                'whole_cost'].sum()
            assert_series_equal(result2, expected)
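
# A minimal sketch of the pattern the test above exercises: pd.Grouper
# resamples a datetime index at a given frequency while 'Buyer' stays a
# regular column grouper.
import pandas as pd

sales = pd.DataFrame({
    'Buyer': ['Carl', 'Carl', 'Joe'],
    'Quantity': [1, 5, 8],
}, index=pd.to_datetime(['2013-01-01', '2013-10-01', '2013-10-02']))
sales.index.name = 'Date'

# annual totals per buyer; 'A' bins by year-end, as in freq='A' above
print(sales.groupby([pd.Grouper(freq='A'), 'Buyer'])['Quantity'].sum())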
Example #17
0
    def test_sort_values_key_empty(self, sort_by_key):
        df = DataFrame(np.array([]))

        df.sort_values(0, key=sort_by_key)
        df.sort_index(key=sort_by_key)
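
# A sketch of what the `key` callable does in practice (pandas >= 1.1):
# it receives each Series being sorted and returns the values to sort by,
# here giving a case-insensitive sort.
import pandas as pd

df = pd.DataFrame({'name': ['banana', 'Cherry', 'apple']})
print(df.sort_values('name'))                               # 'Cherry' first (byte order)
print(df.sort_values('name', key=lambda s: s.str.lower()))  # apple, banana, Cherry
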
Example #18
0
#trade_type.head(20).iloc[:,:5]

#trade_type=pd.read_pickle(data_path+'/trade_type_16_100w')
tmp = trade_type.copy()
tmp.index.names = ['trddt', 'trader', 'action']
#tmp=tmp.groupby(by='trddt').apply(lambda x:x/x.sum());tmp
#tmp.index.names=['stkcd']
select_stock = DataFrame(select_stock)
select_stock.columns.get_level_values(0)  #iloc[:,0].name[0]
Group_By = DataFrame(np.nan,
                     index=tmp.index,
                     columns=select_stock.columns.get_level_values(0))  #
for c in Group_By.columns:
    Group_By[c] = tmp[select_stock[(c, '2015-02-25')]].mean(axis=1)

G1 = Group_By.sort_index().loc(axis=0)[:'20150217', :, :]
G1
G2 = Group_By.sort_index().loc(axis=0)['20150225':, :, :]
G2
G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'] - G1.groupby(
    by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
(G2.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'] -
 G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']
 ) / G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy']

G1.groupby(by=['trader', 'action']).mean().loc(axis=0)[:, 'buy'].reset_index(
    level=1, drop=True) - G1.groupby(by=['trader', 'action']).mean().loc(
        axis=0)[:, 'sell'].reset_index(level=1, drop=True)
G1.groupby(by=['trader']).head()
Example #19
0
    def test_increment_without_metadata_with_schema(
        self,
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
        schema_file: Optional[LocalPath],
        verbose: bool,
    ):
        # List of (expected frame filename, data filename) tuples
        targets: List[Tuple[str, str]] = [
            ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6-combined.csv"),
            ("iris-part-1-2.csv", "iris-part-2-of-6-combined.csv"),
            ("iris-part-1-2-3.csv", "iris-part-3-of-6-combined.csv"),
            ("iris-part-1-2-3-4.csv", "iris-part-4-of-6-combined.csv"),
            ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6-combined.csv"),
            ("iris_plus.csv", "iris-part-6-of-6-combined.csv"),
        ]

        expected_hashfile = (
            LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE) if
            archive_fixture.hash_file is None else archive_fixture.hash_file)
        assert not os.path.exists(expected_hashfile)
        assert not os.path.exists(archive_fixture.cache_file)
        assert len(archive_dir.listdir()) == 0

        expected_schemafile = (archive_dir.join(syphon.schema.DEFAULT_FILE)
                               if schema_file is None else schema_file)
        assert not os.path.exists(expected_schemafile)
        syphon.init(SortedDict({
            "0": "PetalColor",
            "1": "Species"
        }), expected_schemafile)
        assert os.path.exists(expected_schemafile)

        for expected_frame_filename, data_filename in targets:
            assert archive_fixture(
                archive_dir,
                [os.path.join(get_data_path(), data_filename)],
                schema_filepath=schema_file,
                cache_filepath=archive_fixture.cache_file,
                hash_filepath=archive_fixture.hash_file,
                verbose=verbose,
            )
            assert_captured_outerr(capsys.readouterr(), verbose, False)

            expected_frame = DataFrame(
                read_csv(
                    os.path.join(get_data_path(), expected_frame_filename),
                    dtype=str,
                    index_col="Index",
                ))
            expected_frame.sort_index(inplace=True)
            actual_frame = DataFrame(
                read_csv(str(archive_fixture.cache_file),
                         dtype=str,
                         index_col="Index"))
            actual_frame.sort_index(inplace=True)
            assert_captured_outerr(capsys.readouterr(), False, False)

            assert_frame_equal(expected_frame, actual_frame)
            assert os.path.exists(expected_hashfile)
            assert syphon.check(
                archive_fixture.cache_file,
                hash_filepath=expected_hashfile,
                verbose=verbose,
            )
Example #20
0
                                          ret_versus_prob,
                                          axis=0)
        sim += 1
        elapsed_time = time.time() - start_time
        print('Simulation time:', elapsed_time)

    return performance_modelo, ret_medio_ibov_sim, ret_medio_port_sim, ret_medio_port_long_sim, ret_medio_port_short_sim, corr_prob_ret_sim, datas_teste_sim


# load the IBOVESPA components and historical data
compomentes = ler_base_componetes()
base_total = carrega_dados()
datas = DataFrame(base_total['data'].drop_duplicates().values,
                  columns=['data'])
datas = datas.set_index(['data'])
datas = datas.sort_index(axis=0)
limit_inf = '19990202 18:00:000'
limit_sup = '20171230 18:00:000'
datas = datas.loc[limit_inf:limit_sup]
datas = datas.sort_index(axis=0)
datas = datas.reset_index(['data'])

# list of variables for the model (features) and for applying the logarithm (cols)
#features = ['data', 'codigo', 'retorno', 'acao_close', 'roe', 'pl', 'irf', 'sharpe', 'petroleo_close', 'dolar_close','dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
#cols = ['acao_close', 'petroleo_close', 'dolar_close', 'dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
features = [
    'data', 'codigo', 'retorno', 'acao_close', 'roe', 'sharpe', 'dolar_close',
    'sp500_close', 'ibov_fut_close'
]
cols = ['acao_close', 'dolar_close', 'sp500_close', 'ibov_fut_close']
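
# The slice `datas.loc[limit_inf:limit_sup]` above relies on label slicing
# of a sorted index. A minimal sketch of the same idea with a DatetimeIndex
# (assumed analogous to the date index built above):
import pandas as pd

dates = pd.DataFrame(
    {'x': range(4)},
    index=pd.to_datetime(['1998-05-01', '1999-03-01', '2005-01-01', '2018-02-01']),
).sort_index()

# inclusive on both ends; partial-string bounds work on a DatetimeIndex
print(dates.loc['1999-02-02':'2017-12-30'])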
Example #21
0
sort_Ser = unsort_Ser.sort_index()
print(sort_Ser)
print(sort_Ser.sort_index(ascending=False))

# Of course, you can also sort by values
print(unsort_Ser.sort_values())
print(unsort_Ser.sort_values(ascending=False))

# For a DataFrame you can sort the whole frame by one or more of its columns
# Recall the dict-of-lists construction of a DataFrame
temp_df = DataFrame({
    'Stu_name': ['Justin', 'Hux', 'Jacob', 'Steve'],
    'Math_result': [99, 88, 100, 20]
})
# sort_index(by=...) was removed from pandas; sort_values is the supported call
print(temp_df.sort_values('Math_result', ascending=False))

# Compound sorting on several columns at once also goes through sort_values
print(temp_df.sort_values(['Math_result', 'Stu_name'], ascending=False))

# With sorting covered, on to ranking: the rank of each element in a Series
temp_Ser = Series([23, 12, -90, 24, 32, 7, 13])
print(temp_Ser.rank())
# Put the values and their ranks side by side
rank = temp_Ser.rank()
temp_df = DataFrame({'Value': temp_Ser, 'Rank': rank})
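
# A sketch of how tie-breaking changes ranks; 'average' is the default
# used by temp_Ser.rank() above.
import pandas as pd

ties = pd.Series([7, 7, 4, 2])
print(ties.rank())                # ties share the average rank: 3.5, 3.5, 2.0, 1.0
print(ties.rank(method='min'))    # ties take the lowest rank:   3.0, 3.0, 2.0, 1.0
print(ties.rank(method='first'))  # ties break by appearance:    3.0, 4.0, 2.0, 1.0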
Example #22
0
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)

#Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])

frame.sort_index()
frame.sort_index(axis=1)

obj = Series([4, 7, -3, 2])
obj.sort_values()  # Series.order() was removed; sort_values replaces it

obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()  # NaNs are placed at the end by default

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')  # sort_index(by=...) was removed; use sort_values

#ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
Example #23
0
    def test_sort_index_nan_multiindex(self):
        # GH#14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame(
            {
                "date": pd.DatetimeIndex(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    np.nan,
                    280,
                    259,
                    np.nan,
                    623,
                    90,
                    312,
                    np.nan,
                    301,
                    359,
                    801,
                ],
                "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
            }
        ).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)
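
# A compact, runnable version of the behavior the test asserts:
# NaN index labels go last by default and first with na_position='first'.
import numpy as np
import pandas as pd

s = pd.Series([10, 20, 30], index=[2.0, np.nan, 1.0])
print(s.sort_index())                     # 1.0, 2.0, then NaN
print(s.sort_index(na_position='first'))  # NaN first, then 1.0, 2.0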
Example #24
0
def _write_glue_preds(
    task_name: str,
    preds_df: pd.DataFrame,
    pred_dir: str,
    split_name: str,
    strict_glue_format: bool = False,
):
    """ Write predictions to separate files located in pred_dir.
    We write special code to handle various GLUE tasks.

    Use strict_glue_format to guarantee compatibility with GLUE website.

    Args:
        task_name: task name
        preds_df: predictions DataFrame for a single task, as returned by
            evaluate().
        pred_dir: directory to write predictions
        split_name: name of this split ('train', 'val', or 'test')
        strict_glue_format: if true, writes format compatible with GLUE
            website.
    """
    def _apply_pred_map(preds_df, pred_map, key="prediction"):
        """ Apply preds_map, in-place. """
        preds_df[key] = [pred_map[p] for p in preds_df[key]]

    def _write_preds_with_pd(preds_df: pd.DataFrame,
                             pred_file: str,
                             write_type=int):
        """ Write TSV file in GLUE format, using Pandas. """

        required_cols = ["index", "prediction"]
        if strict_glue_format:
            cols_to_write = required_cols
            quoting = QUOTE_NONE
            log.info(
                "Task '%s', split '%s': writing %s in "
                "strict GLUE format.",
                task_name,
                split_name,
                pred_file,
            )
        else:
            all_cols = set(preds_df.columns)
            # make sure we write index and prediction as first columns,
            # then all the other ones we can find.
            cols_to_write = required_cols + sorted(
                list(all_cols.difference(required_cols)))
            quoting = QUOTE_MINIMAL
        preds_df.to_csv(
            pred_file,
            sep="\t",
            index=False,
            float_format="%.3f",
            quoting=quoting,
            columns=cols_to_write,
        )

    if len(preds_df) == 0:  # catch empty lists
        log.warning("Task '%s': predictions are empty!", task_name)
        return

    def _add_default_column(df, name: str, val):
        """ Ensure column exists and missing values = val. """
        if name not in df:
            df[name] = val
        df[name].fillna(value=val, inplace=True)

    preds_df = preds_df.copy()
    _add_default_column(preds_df, "idx", -1)
    _add_default_column(preds_df, "sent1_str", "")
    _add_default_column(preds_df, "sent2_str", "")
    _add_default_column(preds_df, "labels", -1)
    # Rename columns to match output headers.
    preds_df.rename(
        {
            "idx": "index",
            "preds": "prediction",
            "sent1_str": "sentence_1",
            "sent2_str": "sentence_2",
            "labels": "true_label",
        },
        axis="columns",
        inplace=True,
    )

    if task_name == "mnli" and split_name == "test":  # 9796 + 9847 = 19643
        assert len(preds_df) == 19643, "Missing predictions for MNLI!"
        log.info("There are %d examples in MNLI, 19643 were expected",
                 len(preds_df))
        # Sort back to original order to split matched and mismatched, which are
        # treated as a single dataset by jiant.
        preds_df.sort_index(inplace=True)
        pred_map = {0: "neutral", 1: "entailment", 2: "contradiction"}
        _apply_pred_map(preds_df, pred_map, "prediction")
        _write_preds_with_pd(
            preds_df.iloc[:9796],
            _get_pred_filename("mnli-m", pred_dir, split_name,
                               strict_glue_format),
        )
        _write_preds_with_pd(
            preds_df.iloc[9796:],
            _get_pred_filename("mnli-mm", pred_dir, split_name,
                               strict_glue_format),
        )
    elif task_name in ["rte", "qnli"]:
        pred_map = {0: "not_entailment", 1: "entailment"}
        _apply_pred_map(preds_df, pred_map, "prediction")
        _write_preds_with_pd(
            preds_df,
            _get_pred_filename(task_name, pred_dir, split_name,
                               strict_glue_format))
    elif task_name in ["sts-b"]:
        preds_df["prediction"] = [
            min(max(0.0, pred * 5.0), 5.0) for pred in preds_df["prediction"]
        ]
        _write_preds_with_pd(
            preds_df,
            _get_pred_filename(task_name, pred_dir, split_name,
                               strict_glue_format),
            write_type=float,
        )
    elif task_name in ["wmt"]:
        # convert each prediction to a single string if we find a list of
        # tokens
        if isinstance(preds_df["prediction"][0], list):
            assert isinstance(preds_df["prediction"][0][0], str)
            preds_df["prediction"] = [
                " ".join(pred) for pred in preds_df["prediction"]
            ]
        _write_preds_with_pd(
            preds_df,
            _get_pred_filename(task_name, pred_dir, split_name,
                               strict_glue_format),
            write_type=str,
        )
    else:
        _write_preds_with_pd(
            preds_df,
            _get_pred_filename(task_name, pred_dir, split_name,
                               strict_glue_format),
            write_type=int,
        )

    log.info("Wrote predictions for task: %s", task_name)
Example #25
0
frame.apply(f, axis=1)

# Apply the function to the entire DataFrame --------------------------------------------

format = lambda x: '%.2f' % x
frame.applymap(format)
frame.applymap(lambda x: x+10)

# Sorting --------------------------------------------

obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
obj.sort_index(ascending=False)

frame = DataFrame(np.arange(12).reshape((3, 4)),index=['three', 'one', 'two'], columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)
frame.sort_values(by='b', ascending=False)  # sort_index(by=...) was removed; use sort_values


# Summary and statistical information --------------------------------------------
# By default, NaN values are omitted from the calculations

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan,np.nan], [0.75, -1.3]],index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df
df.sum()
df.sum(axis=1)
df.mean(axis=1)
df.mean(axis=1, skipna=False) # rows containing any NaN yield NaN
Example #26
0
    def test_sort_index_duplicates(self):

        # with 9816, these are all translated to .sort_values

        df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"])

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with pytest.raises(ValueError, match="not unique"):
            df.sort_values(by="a")

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by=["a"])
        with pytest.raises(ValueError, match="not unique"):
            df.sort_values(by=["a"])

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                # multi-column 'by' is separate codepath
                df.sort_index(by=["a", "b"])
        with pytest.raises(ValueError, match="not unique"):
            # multi-column 'by' is separate codepath
            df.sort_values(by=["a", "b"])

        # with multi-index
        # GH4370
        df = DataFrame(np.random.randn(4, 2),
                       columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]))
        with pytest.raises(ValueError, match="level"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with pytest.raises(ValueError, match="level"):
            df.sort_values(by="a")

        # convert tuples to a list of tuples
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=[("a", 1)])
        expected = df.sort_values(by=[("a", 1)])

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=("a", 1))
        result = df.sort_values(by=("a", 1))
        assert_frame_equal(result, expected)
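
# A sketch of why the test expects "not unique": with duplicate column
# labels, by='a' is ambiguous. Sorting by column position still works.
import pandas as pd

df = pd.DataFrame([[5, 6, 7, 8], [0, 1, 2, 3]], columns=['a', 'a', 'b', 'b'])
try:
    df.sort_values(by='a')
except ValueError as err:
    print(err)  # complains that the label is not unique

print(df.iloc[df.iloc[:, 0].argsort()])  # sort rows by the first 'a' column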
Example #27
0
    def test_per_axis_per_level_doc_examples(self):

        # test index maker
        idx = pd.IndexSlice

        # from indexing.rst / advanced
        index = MultiIndex.from_product(
            [_mklbl("A", 4),
             _mklbl("B", 2),
             _mklbl("C", 4),
             _mklbl("D", 2)])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )
        df = DataFrame(
            np.arange(len(index) * len(columns), dtype="int64").reshape(
                (len(index), len(columns))),
            index=index,
            columns=columns,
        )
        result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values if (
            a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)
        result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :]
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values if (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)
        result = df.loc[idx[:, :, ["C1", "C3"]], :]
        tm.assert_frame_equal(result, expected)

        # not sorted
        msg = ("MultiIndex slicing requires the index to be lexsorted: "
               r"slicing on levels \[1\], lexsort depth 1")
        with pytest.raises(UnsortedIndexError, match=msg):
            df.loc["A1", ("a", slice("foo"))]

        # GH 16734: not sorted, but no real slicing
        tm.assert_frame_equal(df.loc["A1", (slice(None), "foo")],
                              df.loc["A1"].iloc[:, [0, 2]])

        df = df.sort_index(axis=1)

        # slicing
        df.loc["A1", (slice(None), "foo")]
        df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")]

        # setitem
        df.loc(axis=0)[:, :, ["C1", "C3"]] = -10
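
# A minimal sketch of the lexsort requirement the test checks: slicing a
# MultiIndex with pd.IndexSlice needs the index sorted first.
import pandas as pd

mi = pd.MultiIndex.from_product([['A2', 'A1'], ['C2', 'C1']])
df = pd.DataFrame({'v': range(4)}, index=mi)

idx = pd.IndexSlice
df = df.sort_index()  # without this, label slicing may raise UnsortedIndexError
print(df.loc[idx['A1':'A1', ['C1']], :])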
Example #28
0
 def aggr(func):
     f = lambda a: np.fromiter(map(func, a), dtype='f8')
     arr = np.vstack((f(jim.values()), f(joe.values()))).T
     res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
     return res.sort_index()
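
# The fragment above never shows `jim`, `joe`, or `mi`; assuming they are
# dicts over the same keys and a MultiIndex built from those keys (and that
# numpy as np and pandas' DataFrame are imported), a hedged usage sketch:
import numpy as np
from pandas import MultiIndex

jim = {('a', 1): [1.0, 2.0], ('b', 2): [3.0]}
joe = {('a', 1): [4.0], ('b', 2): [5.0, 6.0]}
mi = MultiIndex.from_tuples(list(jim), names=['k1', 'k2'])

print(aggr(np.mean))  # one row per key, columns 'jim' and 'joe'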
Example #29
0
    def test_sort_nan(self):
        # GH3917
        nan = np.nan
        df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
                        'B': [9, nan, 5, 2, 5, 4, 5]})

        # sort one column only
        expected = DataFrame(
            {'A': [nan, 1, 1, 2, 4, 6, 8],
             'B': [5, 9, 2, nan, 5, 5, 4]},
            index=[2, 0, 3, 1, 6, 4, 5])
        sorted_df = df.sort_values(['A'], na_position='first')
        assert_frame_equal(sorted_df, expected)

        expected = DataFrame(
            {'A': [nan, 8, 6, 4, 2, 1, 1],
             'B': [5, 4, 5, 5, nan, 9, 2]},
            index=[2, 5, 4, 6, 1, 0, 3])
        sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
        assert_frame_equal(sorted_df, expected)

        expected = df.reindex(columns=['B', 'A'])
        sorted_df = df.sort_values(by=1, axis=1, na_position='first')
        assert_frame_equal(sorted_df, expected)

        # na_position='last', order
        expected = DataFrame(
            {'A': [1, 1, 2, 4, 6, 8, nan],
             'B': [2, 9, nan, 5, 5, 4, 5]},
            index=[3, 0, 1, 6, 4, 5, 2])
        sorted_df = df.sort_values(['A', 'B'])
        assert_frame_equal(sorted_df, expected)

        # na_position='first', order
        expected = DataFrame(
            {'A': [nan, 1, 1, 2, 4, 6, 8],
             'B': [5, 2, 9, nan, 5, 5, 4]},
            index=[2, 3, 0, 1, 6, 4, 5])
        sorted_df = df.sort_values(['A', 'B'], na_position='first')
        assert_frame_equal(sorted_df, expected)

        # na_position='first', not order
        expected = DataFrame(
            {'A': [nan, 1, 1, 2, 4, 6, 8],
             'B': [5, 9, 2, nan, 5, 5, 4]},
            index=[2, 0, 3, 1, 6, 4, 5])
        sorted_df = df.sort_values(['A', 'B'], ascending=[
                                   1, 0], na_position='first')
        assert_frame_equal(sorted_df, expected)

        # na_position='last', not order
        expected = DataFrame(
            {'A': [8, 6, 4, 2, 1, 1, nan],
             'B': [4, 5, 5, nan, 2, 9, 5]},
            index=[5, 4, 6, 1, 3, 0, 2])
        sorted_df = df.sort_values(['A', 'B'], ascending=[
                                   0, 1], na_position='last')
        assert_frame_equal(sorted_df, expected)

        # Test DataFrame with nan label
        df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
                        'B': [9, nan, 5, 2, 5, 4, 5]},
                       index=[1, 2, 3, 4, 5, 6, nan])

        # NaN label, ascending=True, na_position='last'
        sorted_df = df.sort_index(
            kind='quicksort', ascending=True, na_position='last')
        expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
                              'B': [9, nan, 5, 2, 5, 4, 5]},
                             index=[1, 2, 3, 4, 5, 6, nan])
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=True, na_position='first'
        sorted_df = df.sort_index(na_position='first')
        expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
                              'B': [5, 9, nan, 5, 2, 5, 4]},
                             index=[nan, 1, 2, 3, 4, 5, 6])
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='last'
        sorted_df = df.sort_index(kind='quicksort', ascending=False)
        expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
                              'B': [4, 5, 2, 5, nan, 9, 5]},
                             index=[6, 5, 4, 3, 2, 1, nan])
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='first'
        sorted_df = df.sort_index(
            kind='quicksort', ascending=False, na_position='first')
        expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
                              'B': [5, 4, 5, 2, 5, nan, 9]},
                             index=[nan, 6, 5, 4, 3, 2, 1])
        assert_frame_equal(sorted_df, expected)
Example #30
0
        tuples)  #multiindexing the columns with tuple values

    df1.index.names = iname  #naming the index

    df1.dropna(inplace=True)  #dropping null values
    df1.head(10)

    #reshaping the table
    df1 = df1.stack()
    df1 = df1.stack()
    df1 = df1.stack()

    df1 = DataFrame(df1, columns=['NUMBER'])  #naming the value column
    df1 = df1.swaplevel(-3, -1, axis=0)  #ordering the indexes

    df1.sort_index(level=-4, axis=0, ascending=True,
                   inplace=True)  #sorting the index

    df1.index.rename(['Total', 'Different Residence', 'Different County'],
                     level=[-3, -2, -1],
                     inplace=True)  #renaming the indexes which left unnamed
    df1.head(20)

    #repeating all the steps for df2
    cname5 = df2.iloc[0, :1]

    df2.drop([86, 87], axis=0, inplace=True)
    df2.drop([10], axis=1, inplace=True)

    df2.set_index([0], inplace=True)
    df2.columns = pd.MultiIndex.from_tuples(tuples)