Example 1
# These snippets assume `stride` is numpy.lib.stride_tricks.as_strided:
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided as stride


def roll_multi_result(df: pd.DataFrame, apply_func: callable, window: int, return_col_num: int, **kwargs):
    """
    rolling with multiple columns on 2 dim pd.Dataframe
    * the result can apply the function which can return pd.Series with multiple columns

    call apply function with numpy ndarray
    
    :param return_col_num: apply_func返回的个数(列数)
    :param apply_func: --注意:传递的参数,前N个为原index,N=index的维数
    :param df: [pd.DataFrame,pd.Series]
    :param window: 滚动窗口
    :param kwargs: 
    :return:
    """

    # move index to values
    v = df.reset_index().values

    dim0, dim1 = v.shape
    stride0, stride1 = v.strides

    stride_values = stride(v, (dim0 - (window - 1), window, dim1), (stride0, stride0, stride1))

    result_values = np.full((dim0, return_col_num), np.nan)

    for idx, values in enumerate(stride_values, window - 1):
        # values: the leading column(s) are the original index, the rest the data
        result_values[idx] = apply_func(values, **kwargs)

    return result_values
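A hypothetical usage sketch (the data and the mean_and_spread helper are invented here, and stride is assumed to be numpy's as_strided): the apply function receives a plain (window, index levels + columns) ndarray and must return return_col_num values.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(10.0), "b": np.arange(10.0) ** 2})

def mean_and_spread(values):
    # values[:, 0] is the original index; values[:, 1:] holds the data columns
    data = values[:, 1:]
    return data.mean(), data.max() - data.min()

out = roll_multi_result(df, mean_and_spread, window=3, return_col_num=2)
print(out[:4])  # the first window - 1 rows stay NaN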
def roll(df: pd.DataFrame, w: int, **kwargs):
    """
    Rolling window on dataframe using multiple columns
    
    >>> roll(pd.DataFrame(np.random.randn(10,3), index=list('ABCDEFGHIJ')), 3).apply(print)
    
    or alternatively 
    
    >>> pd.DataFrame(np.random.randn(10,3), index=list('ABCDEFGHIJ')).pipe(roll, 3).apply(lambda x: print(x[2]))
    
    :param df: pandas DataFrame
    :param w: window size (only integers)
    :return: rolling window
    """
    if w > len(df):
        raise ValueError("Window size exceeds number of rows!")

    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides
    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))
    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })

    return rolled_df.groupby(level=0, **kwargs)
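A hypothetical follow-up (column names invented) showing why the groupby view is useful: each group handed to apply() is a w-row DataFrame, so per-window statistics that need several columns at once, such as a rolling correlation, become a one-liner.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 3), columns=["x", "y", "z"],
                  index=list("ABCDEFGHIJ"))

# each group is the w-row window anchored at one original index label
rolling_corr = roll(df, 3).apply(lambda g: g["x"].corr(g["y"]))
print(rolling_corr)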
Example 3
def roll(df, w, **kwargs):
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides

    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))

    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })

    return rolled_df.groupby(level=0, **kwargs)
Example 4
def makeS(full, train, var):
    """
    Given 2 dataframes, "train" which is a subset of "full" used to train a
    model, and a variable name, makes a new matrix of a row length(n) equal to
    the row length of the training data frame and a column length(m) equal to
    the row length of the full data frame. Each column is a repeat of the
    values of the selected variable from the training data frame. The new
    matrix created however only takes up the memory space of what a single
    column does.
    """
    ncol = len(full)
    vec = train[var].values
    return stride(vec, shape=(len(vec), ncol), strides=(vec.itemsize, 0))
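A minimal sketch of the zero-stride trick makeS relies on (the frames and the column name are made up, and stride is assumed to be numpy's as_strided): the result behaves like an (n, m) matrix, but its column stride of 0 means no data is actually copied.

import numpy as np
import pandas as pd

full = pd.DataFrame({"x": np.arange(5.0)})
train = full.iloc[:3]

S = makeS(full, train, "x")
print(S.shape)    # (3, 5): len(train) rows, len(full) columns
print(S[:, 0])    # every column is the same view of train["x"]
print(S.strides)  # (8, 0): zero column stride, so only one column's worth of memory is used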
Example 5
def roll(df, w, **kwargs):
    # returns an iterable (groupby) of w-row DataFrames, one per rolling window (for window-type functions)
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides

    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))
    if len(a) == 0:
        return np.array([])
    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })
    return rolled_df.groupby(level=0, **kwargs)
Example 6
def roll(df, w, **kwargs):
    """ Helper function to apply rolling calculation to multiple columns """
    
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides

    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))

    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })

    return rolled_df.groupby(level=0, **kwargs)
Example 7
def roll(df, w, **kwargs):
    """
    Roll a df.
    Input: df -> df to be rolled
           w  -> the length of the rolling window
    """
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides

    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))

    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })

    return rolled_df.groupby(level=0, **kwargs)
Example 8
    def _transform_all_data(self):
        self._log.debug("transforming NGSIM data")
        df = pd.read_csv(
            self._path,
            sep=r"\s+",
            header=None,
            names=(
                "vehicle_id",
                "frame_id",  # 1 frame per .1s
                "total_frames",
                "global_time",  # msecs
                # front center in feet from left lane edge
                "position_x" if not self._swap_xy else "position_y",
                # front center in feet from entry edge
                "position_y" if not self._swap_xy else "position_x",
                "global_x" if not self._swap_xy else "global_y",  # front center in feet
                "global_y" if not self._swap_xy else "global_x",  # front center in feet
                "length",  # feet
                "width",  # feet
                "type",  # 1 = motorcycle, 2 = auto, 3 = truck
                "speed",  # feet / sec
                "acceleration",  # feet / sec^2
                "lane_id",  # lower is further left
                "preceding_vehicle_id",
                "following_vehicle_id",
                "spacing",  # feet
                "headway",  # secs
            ),
        )

        df["sim_time"] = df["global_time"] - min(df["global_time"])

        # offset of the map from the data...
        x_offset = self._dataset_spec.get("x_offset_px", 0) / self.scale
        y_offset = self._dataset_spec.get("y_offset_px", 0) / self.scale

        df["length"] *= METERS_PER_FOOT
        df["width"] *= METERS_PER_FOOT
        df["speed"] *= METERS_PER_FOOT
        df["acceleration"] *= METERS_PER_FOOT
        df["spacing"] *= METERS_PER_FOOT
        df["position_y"] *= METERS_PER_FOOT
        # SMARTS uses center not front
        df["position_x"] = (
            df["position_x"] * METERS_PER_FOOT - 0.5 * df["length"] - x_offset
        )
        if y_offset:
            df["position_y"] = df["position_y"] - y_offset

        if self._flip_y:
            max_y = self._dataset_spec["map_net"]["max_y"]
            df["position_y"] = (max_y / self.scale) - df["position_y"]

        # Use moving average to smooth positions...
        df.sort_values("sim_time", inplace=True)  # just in case it wasn't already...
        k = 15  # kernel size for positions
        for vehicle_id in set(df["vehicle_id"]):
            same_car = df["vehicle_id"] == vehicle_id
            df.loc[same_car, "position_x"] = (
                df.loc[same_car, "position_x"]
                .rolling(window=k)
                .mean()
                .shift(1 - k)
                .values
            )
            df.loc[same_car, "position_y"] = (
                df.loc[same_car, "position_y"]
                .rolling(window=k)
                .mean()
                .shift(1 - k)
                .values
            )
            # and compute heading with (smaller) rolling window (=3) too..
            v = df.loc[same_car, ["position_x", "position_y"]].shift(1).values
            d0, d1 = v.shape
            s0, s1 = v.strides
            headings = [
                self._cal_heading(values)
                for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
            ]
            df.loc[same_car, "heading_rad"] = headings + [headings[-1], headings[-1]]
            # ... and new speeds (based on these smoothed positions)
            # (This also overcomes problem that NGSIM speeds are "instantaneous"
            # and so don't match with dPos/dt, which can affect some models.)
            speeds = [
                self._cal_speed(values)
                for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
            ]
            df.loc[same_car, "speed_discrete"] = speeds + [None, None]

        map_width = self._dataset_spec["map_net"].get("width")
        if map_width:
            valid_x = (df["position_x"] * self.scale).between(0, map_width)
            df = df[valid_x]

        return df
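A toy illustration (not from the SMARTS source; _cal_heading and _cal_speed are class internals and are not reproduced) of the overlapping 3-row windows that the stride call above builds for the heading and speed loop.

import numpy as np
from numpy.lib.stride_tricks import as_strided as stride

v = np.arange(12.0).reshape(6, 2)  # stand-in for the shifted (x, y) positions
d0, d1 = v.shape
s0, s1 = v.strides
windows = stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
print(windows.shape)  # (4, 3, 2): one 3-row window per step
print(windows[0])     # three consecutive position rows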
Example 9
    def _transform_all_data(self):
        self._log.debug("transforming NGSIM data")
        cols = (
            "vehicle_id",
            "frame_id",  # 1 frame per .1s
            "total_frames",
            "global_time",  # msecs
            # front center in feet from left lane edge
            "position_x" if not self._swap_xy else "position_y",
            # front center in feet from entry edge
            "position_y" if not self._swap_xy else "position_x",
            "global_x"
            if not self._swap_xy else "global_y",  # front center in feet
            "global_y"
            if not self._swap_xy else "global_x",  # front center in feet
            "length",  # feet
            "width",  # feet
            "type",  # 1 = motorcycle, 2 = auto, 3 = truck
            "speed",  # feet / sec
            "acceleration",  # feet / sec^2
            "lane_id",  # lower is further left
            "preceding_vehicle_id",
            "following_vehicle_id",
            "spacing",  # feet
            "headway",  # secs
        )
        if self._dataset_spec.get("source") == "NGSIM2":
            extra_cols = (
                "origin_zone",
                "destination_zone",
                "intersection",
                "section",
                "direction",
                "movement",
            )
            cols = cols[:16] + extra_cols + cols[16:]
        df = pd.read_csv(self._path, sep=r"\s+", header=None, names=cols)

        df["sim_time"] = df["global_time"] - min(df["global_time"])

        # offset of the map from the data...
        x_margin = self._dataset_spec.get("x_margin_px", 0) / self.scale
        y_margin = self._dataset_spec.get("y_margin_px", 0) / self.scale

        df["length"] *= METERS_PER_FOOT
        df["width"] *= METERS_PER_FOOT
        df["speed"] *= METERS_PER_FOOT
        df["acceleration"] *= METERS_PER_FOOT
        df["spacing"] *= METERS_PER_FOOT
        df["position_x"] *= METERS_PER_FOOT
        df["position_y"] *= METERS_PER_FOOT
        if x_margin:
            df["position_x"] = df["position_x"] - x_margin
        if y_margin:
            df["position_y"] = df["position_y"] - y_margin

        if self._flip_y:
            max_y = self._dataset_spec["map_net"]["max_y"]
            df["position_y"] = (max_y / self.scale) - df["position_y"]

        # Use moving average to smooth positions...
        df.sort_values("sim_time",
                       inplace=True)  # just in case it wasn't already...
        k = 15  # kernel size for positions
        for vehicle_id in set(df["vehicle_id"]):
            same_car = df["vehicle_id"] == vehicle_id
            df.loc[same_car,
                   "position_x"] = (df.loc[same_car, "position_x"].rolling(
                       window=k).mean().shift(1 - k).values)
            df.loc[same_car,
                   "position_y"] = (df.loc[same_car, "position_y"].rolling(
                       window=k).mean().shift(1 - k).values)
            # and compute heading with (smaller) rolling window (=3) too..
            shift = int(self._heading_window / 2)
            pad = self._heading_window - shift - 1
            v = df.loc[same_car, ["position_x", "position_y", "speed"]].values
            v = np.insert(v, 0, [[np.nan, np.nan, np.nan]] * shift, axis=0)
            headings = [
                self._cal_heading(values)
                for values in sliding_window_view(v, (self._heading_window, 3))
            ]
            df.loc[same_car, "heading_rad"] = headings + [headings[-1]] * pad
            # ... and new speeds (based on these smoothed positions)
            # (This also overcomes problem that NGSIM speeds are "instantaneous"
            # and so don't match with dPos/dt, which can affect some models.)
            v = df.loc[same_car, ["position_x", "position_y"]].shift(1).values
            d0, d1 = v.shape
            s0, s1 = v.strides
            speeds = [
                self._cal_speed(values)
                for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
            ]
            df.loc[same_car, "speed_discrete"] = speeds + [None, None]

        # since SMARTS' positions are the vehicle centerpoints, but NGSIM's are at the front,
        # now adjust the vehicle position to its centerpoint based on its angle (+y = 0 rad)
        df["position_x"] = df["position_x"] - 0.5 * df["length"] * np.cos(
            df["heading_rad"] + 0.5 * math.pi)
        df["position_y"] = df["position_y"] - 0.5 * df["length"] * np.sin(
            df["heading_rad"] + 0.5 * math.pi)

        map_width = self._dataset_spec["map_net"].get("width")
        if map_width:
            valid_x = (df["position_x"] * self.scale).between(
                df["length"] / 2, map_width - df["length"] / 2)
            df = df[valid_x]

        return df
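A toy illustration of the NaN-padded sliding_window_view used for the headings in this version (the window length and positions are invented; sliding_window_view requires numpy >= 1.20).

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

heading_window = 3
shift = int(heading_window / 2)            # number of NaN rows padded at the front
v = np.arange(10.0).reshape(5, 2)          # stand-in for (x, y) positions
v = np.insert(v, 0, [[np.nan, np.nan]] * shift, axis=0)
windows = sliding_window_view(v, (heading_window, 2))
print(windows.shape)   # (4, 1, 3, 2): one window per original row except the final pad
print(windows[0, 0])   # the first window starts with the NaN padding row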