Beispiel #1
0
    def search(self,
               df,
               num_examples_per_instance,
               minimum_data=None,
               gap=None,
               drop_empty=True,
               label_type=None,
               verbose=True,
               *args,
               **kwargs):
        """Search the data and calculate labels.

        Args:
            df (DataFrame): Data frame to search and extract labels.
            num_examples_per_instance (int or dict): The expected number of examples to return from each entity group.
                When a dictionary is given, the search is run per label (``LabelSearch``);
                otherwise it is run per example (``ExampleSearch``).
            minimum_data (str): Minimum data before starting search. Default value is first time of index.
            gap (str or int): Time between examples. Default value is window size.
                If an integer, search will start on the first event after the minimum data.
            drop_empty (bool): Whether to drop empty slices. Default value is True.
            label_type (str): The label type can be "continuous" or "categorical".
                Accepted for interface compatibility; this method does not read it.
            verbose (bool): Whether to render progress bar. Default value is True.
            *args: Positional arguments for labeling function.
            **kwargs: Keyword arguments for labeling function.

        Returns:
            lt (LabelTimes): Calculated labels with cutoff times.

        Raises:
            AssertionError: If no labeling function is set on the instance.
        """
        assert self.labeling_function, 'missing labeling function(s)'
        self._check_example_count(num_examples_per_instance, gap)

        # Without a configured window, fall back to the row count of the input.
        # The fallback is stored on the instance, so it persists across calls.
        if not self.window_size:
            self.window_size = len(df)
        gap = to_offset(gap if gap else self.window_size)

        # A dict of per-label counts selects the label-driven search strategy.
        if isinstance(num_examples_per_instance, dict):
            strategy_type = LabelSearch
        else:
            strategy_type = ExampleSearch
        strategy = strategy_type(num_examples_per_instance)

        # NOTE(review): *args is unpacked after keyword arguments, so any extra
        # positionals fill the leading positional parameters of _run_search.
        records = self._run_search(
            df=df,
            search=strategy,
            gap=gap,
            min_data=minimum_data,
            drop_empty=drop_empty,
            verbose=verbose,
            *args,
            **kwargs,
        )

        target_columns = list(self.labeling_function)

        # Settings are stringified so the search configuration can be recorded as plain text.
        settings = {
            'num_examples_per_instance': num_examples_per_instance,
            'minimum_data': str(minimum_data),
            'window_size': str(self.window_size),
            'gap': str(gap),
        }

        return LabelTimes(
            data=records,
            target_columns=target_columns,
            target_entity=self.target_entity,
            search_settings=settings,
        )
Beispiel #2
0
    def slice(self, df, num_examples_per_instance, minimum_data=None, gap=None, drop_empty=True, verbose=False):
        """Lazily generate data slices for each instance of the target entity.

        Args:
            df (DataFrame): Data frame to create slices on.
            num_examples_per_instance (int): Number of examples per unique instance of target entity.
                A value of -1 removes the limit.
            minimum_data (str): Minimum data before starting search. Default value is first time of index.
            gap (str or int): Time between examples. Default value is window size.
                If an integer, search will start on the first event after the minimum data.
            drop_empty (bool): Whether to drop empty slices. Default value is True.
            verbose (bool): Whether to print metadata about slice. Default value is False.

        Returns:
            ds (generator): Returns a generator of data slices.
        """
        self._check_example_count(num_examples_per_instance, gap)

        # Without a configured window, fall back to the row count of the input.
        if not self.window_size:
            self.window_size = len(df)
        gap = to_offset(gap if gap else self.window_size)

        grouped = self.set_index(df).groupby(self.target_entity)

        # -1 is the sentinel for "no limit on examples per instance".
        limit = float('inf') if num_examples_per_instance == -1 else num_examples_per_instance

        for instance, frame in grouped:
            instance_slices = self._slice(df=frame, gap=gap, min_data=minimum_data, drop_empty=drop_empty)

            for data_slice in instance_slices:
                data_slice.context.target_instance = instance

                if verbose:
                    print(data_slice)

                yield data_slice

                # The limit is checked after yielding, so the slice that reaches it is still emitted.
                if data_slice.context.slice_number >= limit:
                    break
Beispiel #3
0
    def _set_window_size(self, window_size):
        """Store the window size on the instance, converting it with ``to_offset`` when given.

        Args:
            window_size (str or int): Duration of each data slice.
                ``None`` is stored as-is; the default value for window size is all future data.
        """
        self.window_size = None if window_size is None else to_offset(window_size)
Beispiel #4
0
    def slice(self,
              df,
              num_examples_per_instance,
              minimum_data=None,
              gap=None,
              drop_empty=True,
              verbose=False):
        """Lazily generate data slices for each instance of the target entity.

        Args:
            df (DataFrame) : Data frame to create slices on.
            num_examples_per_instance (int) : Number of examples per unique instance of target entity.
                A value of -1 removes the limit.
            minimum_data (str) : Minimum data before starting search. Default value is first time of index.
            gap (str or int) : Time between examples. Default value is window size.
                If an integer, search will start on the first event after the minimum data.
            drop_empty (bool) : Whether to drop empty slices. Default value is True.
            verbose (bool) : Whether to print metadata about slice. Default value is False.

        Returns:
            generator : Yields one DataSlice at a time.

        Raises:
            AssertionError : If more than one example is requested while both
                the window size and the gap are unset.
        """
        no_step_defined = self.window_size is None and gap is None
        if no_step_defined:
            exceeds_one = num_examples_per_instance > 1
            assert not exceeds_one, "must specify gap if num_examples > 1 and window size = none"

        # Without a configured window, fall back to the row count of the input.
        if not self.window_size:
            self.window_size = len(df)
        gap = to_offset(gap if gap else self.window_size)

        indexed = self.set_index(df)

        # -1 is the sentinel for "no limit on examples per instance".
        limit = float('inf') if num_examples_per_instance == -1 else num_examples_per_instance

        for entity_group in indexed.groupby(self.target_entity):
            group_slices = self._get_slices(group=entity_group,
                                            gap=gap,
                                            min_data=minimum_data,
                                            drop_empty=drop_empty)

            for data_slice in group_slices:
                if verbose:
                    print(data_slice)

                yield data_slice

                # The limit is checked after yielding, so the slice that reaches it is still emitted.
                if data_slice.context.slice_number >= limit:
                    break
Beispiel #5
0
    def __init__(self,
                 target_entity,
                 time_index,
                 labeling_function,
                 window_size=None,
                 label_type=None):
        """Creates an instance of label maker.

        Args:
            target_entity (str) : Entity on which to make labels.
            time_index (str): Name of time column in the data frame.
            labeling_function (function) : Function that transforms a data slice to a label.
            window_size (str or int) : Duration of each data slice.
                The default value for window size is all future data.
            label_type : Accepted for interface compatibility; the constructor does not store or read it.
        """
        # Normalize the window up front; None is kept so later code can detect "no window configured".
        offset = window_size
        if offset is not None:
            offset = to_offset(offset)

        self.target_entity = target_entity
        self.time_index = time_index
        self.labeling_function = labeling_function
        self.window_size = offset
Beispiel #6
0
    def _get_slices(self, group, gap=None, min_data=None, drop_empty=True):
        """Generate data slices for a single group of the target entity.

        Walks forward through the group's rows, emitting one slice per window and
        advancing the start of the remaining data by ``gap`` after each window.

        Args:
            group (tuple) : A ``(key, df)`` pair, where ``key`` identifies the target instance
                and ``df`` is the data frame holding that instance's rows.
            gap (str or int) : Time between examples. Default value is window size.
                If an integer, search will start on the first event after the minimum data.
            min_data (int or str or Timestamp) : Threshold to cutoff data.
                Defaults to the first value of the index.
            drop_empty (bool) : Whether to drop empty slices. Default value is True.

        Returns:
            generator : Yields one DataSlice per window.

        Raises:
            AssertionError : If the index of the data frame is not monotonically increasing.
        """
        key, df = group
        # Without a configured window, fall back to the row count of this group.
        # The fallback is stored on the instance, so it persists for later groups and calls.
        self.window_size = self.window_size or len(df)
        gap = to_offset(gap or self.window_size)

        # Rows without a time index value cannot be placed in any window.
        df = df.loc[df.index.notnull()]
        assert df.index.is_monotonic_increasing, "Please sort your dataframe chronologically before calling search"

        if df.empty:
            return

        # NOTE(review): cutoff_data is defined elsewhere; presumably it drops the rows
        # before the threshold and returns the resulting start time -- confirm.
        threshold = min_data or df.index[0]
        df, cutoff_time = cutoff_data(df=df, threshold=threshold)

        if df.empty:
            return

        # With a row-count gap, the search starts at the first remaining event.
        if isinstance(gap, int):
            cutoff_time = df.index[0]

        df = DataSlice(df)
        df.context = Context(slice_number=0,
                             target_entity=self.target_entity,
                             target_instance=key)

        def iloc(index, i):
            # Positional lookup that returns None (implicitly) when i is out of range.
            if i < index.size:
                return index[i]

        while not df.empty and cutoff_time <= df.index[-1]:
            if isinstance(self.window_size, int):
                # Row-count window: take the first window_size rows. The window end is the
                # index value of the row just past the window, or None when there is none.
                df_slice = df.iloc[:self.window_size]
                window_end = iloc(df.index, self.window_size)

            else:
                # Time-based window: take everything up to cutoff_time + window_size.
                window_end = cutoff_time + self.window_size
                df_slice = df[:window_end]

                # Pandas includes both endpoints when slicing by time.
                # This results in the right endpoint overlapping in consecutive data slices.
                # Resolved by making the right endpoint exclusive.
                # https://pandas.pydata.org/pandas-docs/version/0.19/gotchas.html#endpoints-are-inclusive

                if not df_slice.empty:
                    is_overlap = df_slice.index == window_end

                    # Rows at window_end are removed only when the slice has more than one row.
                    if df_slice.index.size > 1 and is_overlap.any():
                        df_slice = df_slice[~is_overlap]

            # NOTE(review): this relies on the slice carrying a context object
            # (presumably propagated from df by DataSlice) -- confirm.
            df_slice.context.window = (cutoff_time, window_end)

            if isinstance(gap, int):
                # Row-count gap: skip ahead by gap rows and restart at the next remaining event.
                gap_end = iloc(df.index, gap)
                df_slice.context.gap = (cutoff_time, gap_end)
                df = df.iloc[gap:]

                if not df.empty:
                    cutoff_time = df.index[0]

            else:
                # Time-based gap: move the cutoff forward, trimming the remaining data only
                # while the new cutoff is still inside it; otherwise the loop condition ends the search.
                gap_end = cutoff_time + gap
                df_slice.context.gap = (cutoff_time, gap_end)
                cutoff_time += gap

                if cutoff_time <= df.index[-1]:
                    df = df[cutoff_time:]

            # Skipped empty slices do not advance the slice counter.
            if df_slice.empty and drop_empty:
                continue

            # NOTE(review): the counter is incremented on the remaining data's context, not on
            # df_slice.context; presumably both refer to the same Context object -- confirm.
            df.context.slice_number += 1

            yield df_slice