def to_timeseries(self):
    '''
    Merge all groups held by this GroupedTimeSeries back into one
    TimeSeries.

    The result is built from the grouped object's underlying SFrame and
    is indexed by the same index column as the grouped data.

    Returns
    -------
    out : TimeSeries

    Examples
    --------
    .. sourcecode:: python

      >>> data = grouped_ts.to_timeseries()
      +---------------------+-------+
      |        index        | value |
      +---------------------+-------+
      | 2015-01-01 00:00:00 |   0   |
      | 2015-01-02 00:00:00 |   1   |
      | 2015-01-03 00:00:00 |   0   |
      | 2015-01-04 00:00:00 |   1   |
      | 2015-01-05 00:00:00 |   0   |
      | 2015-01-06 00:00:00 |   1   |
      | 2015-01-07 00:00:00 |   0   |
      | 2015-01-08 00:00:00 |   1   |
      | 2015-01-09 00:00:00 |   0   |
      | 2015-01-10 00:00:00 |   1   |
      +---------------------+-------+
      [366 rows x 2 columns]
    '''
    tracker = _mt._get_metric_tracker()
    tracker.track('grouped_timeseries.to_timeseries')

    merged_sframe = self._grouped_ts.sframe
    return _graphlab.TimeSeries(merged_sframe, index=self.index_col_name)
def get(self, field):
    """
    Look up ``field`` in the model and return its value.

    The "scores" field is special-cased: when the model's
    'dataset_type' is 'TimeSeries', the stored scores are wrapped in a
    TimeSeries indexed by the proxy's index column before being returned.
    Every other request is forwarded to the proxy unchanged.

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out
        Value of the requested field.

    See Also
    --------
    list_fields
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkits.anomaly_detection.bayesian_changepoints.get')

    proxy = self.__proxy__
    wants_timeseries_scores = (
        field == "scores" and proxy.get('dataset_type') == 'TimeSeries')

    if not wants_timeseries_scores:
        return proxy.get(field)

    raw_scores = proxy.get('scores')
    return _gl.TimeSeries(raw_scores, index=proxy.get_index_col_name())
def generator():
    """
    Yield ``(group name, TimeSeries)`` style tuples, one per group.

    Groups are pulled from the backing grouped object in fixed-size
    batches; a batch shorter than the requested size marks the end of the
    iteration.
    """
    batch_size = 16
    backend = self._grouped_ts

    backend.begin_iterator()
    batch = backend.iterator_get_next(batch_size)

    while True:
        for entry in batch:
            # Temporary grouping columns may be absent; that is fine.
            try:
                entry[1].remove_columns(self._temp_col_names)
            except KeyError:
                pass

            entry[1] = _graphlab.TimeSeries(entry[1],
                                            self.index_col_name,
                                            is_sorted=True)
            yield tuple(entry)

        # A short batch means the backend has nothing more to hand out.
        if len(batch) != batch_size:
            break
        batch = backend.iterator_get_next(batch_size)
def _load_version(self, unpickler, version):
    """
    Rebuild a MovingZScoreModel from a previously saved state.

    For models whose 'dataset_type' is 'TimeSeries', the saved 'scores'
    object is wrapped back into a TimeSeries using the saved
    'index_col_name', and that helper key is then dropped from the state.

    Parameters
    ----------
    unpickler : GLUnpickler
        A GLUnpickler file handler.

    version : int
        Version number maintained by the class writer.
    """
    restored = unpickler.load()

    if restored['dataset_type'] == 'TimeSeries':
        saved_scores = restored['scores']
        index_name = restored['index_col_name']
        restored['scores'] = _gl.TimeSeries(saved_scores, index=index_name)
        restored.pop('index_col_name')

    # State written at version 0 gets 'min_observations' set to None.
    if version == 0:
        restored['min_observations'] = None

    return MovingZScoreModel(restored)
def _save_impl(self, pickler):
    """
    Save the model as a directory, which can be loaded with the
    :py:func:`~graphlab.load_model` method.

    The GL pickler does not support TimeSeries, so for TimeSeries-backed
    models the 'scores' entry is temporarily replaced by its SFrame form
    and the index column name is stored alongside it under
    'index_col_name'. The proxy does not support copying, so the swap is
    done on the proxy itself and is always undone afterwards, even when
    the dump raises.

    Parameters
    ----------
    pickler : GLPickler
        An opened GLPickle archive (Do not close the archive).

    Raises
    ------
    Exception
        Whatever ``pickler.dump`` raises is propagated unchanged, after
        the proxy has been restored.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = graphlab.load_model('my_model_file')
    """
    proxy = self.__proxy__

    if proxy['dataset_type'] != 'TimeSeries':
        pickler.dump(proxy)
        return

    # Keep a handle on the original TimeSeries so it can be put back
    # exactly as it was, rather than being rebuilt from the SFrame.
    original_scores = proxy['scores']
    proxy['index_col_name'] = original_scores.index_col_name
    proxy['scores'] = original_scores.to_sframe()
    try:
        pickler.dump(proxy)
    finally:
        # Undo the temporary conversion no matter how the dump ended, so
        # a failed save cannot leave the live model in a corrupted state.
        proxy['scores'] = original_scores
        proxy.pop('index_col_name')
def get_group(self, name):
    """
    Get the TimeSeries associated with the group `name`.

    The name of the group corresponds to the distinct value in the
    column(s) that the group was performed on. Check the output of
    `graphlab.timeseries.GroupedTimeSeries.groups` for all available
    group names.

    The caller's `name` list is never modified.

    Parameters
    ----------
    name : type | list
        Name of the group(s). If more than one column, the name is a list
        of the values of the group, in the same order that they were
        expressed to the group call.

    Returns
    -------
    ts : `graphlab.TimeSeries`

    Examples
    --------
    >>> import datetime as dt
    >>> start = dt.datetime(2013, 5, 7)
    >>> end = dt.datetime(2013, 5, 9, 23, 59, 59)
    >>> sa = gl.TimeSeries.date_range(start,end,dt.timedelta(hours=12))
    >>> sf = gl.SFrame({'time':sa,
    ...      'numbers':[(i % 2) for i in range(0,len(sa))],
    ...      'words':['day' if (i % 2) else 'night' for i in range(0,len(sa))]})

    # Create a timeseries.
    >>> ts = gl.TimeSeries(sf, index='time')
    >>> print ts
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-07 00:00:00 |    0    | night |
    | 2013-05-07 12:00:00 |    1    |  day  |
    | 2013-05-08 00:00:00 |    0    | night |
    | 2013-05-08 12:00:00 |    1    |  day  |
    | 2013-05-09 00:00:00 |    0    | night |
    | 2013-05-09 12:00:00 |    1    |  day  |
    +---------------------+---------+-------+
    [6 rows x 3 columns]
    The index column of the TimeSeries is: time

    # Group the timeseries by hour.
    >>> by_hour = ts.group(ts.date_part.HOUR)
    >>> by_hour.get_group(12)
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-07 12:00:00 |    1    |  day  |
    | 2013-05-08 12:00:00 |    1    |  day  |
    | 2013-05-09 12:00:00 |    1    |  day  |
    +---------------------+---------+-------+
    [3 rows x 3 columns]
    The index column of the TimeSeries is: time

    >>> by_word = ts.group('words')
    >>> by_word.get_group('night')
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-07 00:00:00 |    0    | night |
    | 2013-05-08 00:00:00 |    0    | night |
    | 2013-05-09 00:00:00 |    0    | night |
    +---------------------+---------+-------+
    [3 rows x 3 columns]
    The index column of the TimeSeries is: time

    >>> by_num = ts.group('numbers')
    >>> by_num.get_group(1)
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-07 12:00:00 |    1    |  day  |
    | 2013-05-08 12:00:00 |    1    |  day  |
    | 2013-05-09 12:00:00 |    1    |  day  |
    +---------------------+---------+-------+
    [3 rows x 3 columns]
    The index column of the TimeSeries is: time

    >>> by_both = ts.group(['numbers','words'])
    >>> by_both.get_group([1, 'day'])
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-07 12:00:00 |    1    |  day  |
    | 2013-05-08 12:00:00 |    1    |  day  |
    | 2013-05-09 12:00:00 |    1    |  day  |
    +---------------------+---------+-------+
    [3 rows x 3 columns]
    The index column of the TimeSeries is: time

    >>> by_day = ts.group([ts.date_part.YEAR,
    ...                    ts.date_part.MONTH,
    ...                    ts.date_part.DAY])
    >>> by_day.get_group([2013,5,9])
    +---------------------+---------+-------+
    |         time        | numbers | words |
    +---------------------+---------+-------+
    | 2013-05-09 00:00:00 |    0    | night |
    | 2013-05-09 12:00:00 |    1    |  day  |
    +---------------------+---------+-------+
    [2 rows x 3 columns]
    The index column of the TimeSeries is: time
    """
    # Work on a private copy: the sentinel appended below must not leak
    # into the list object owned by the caller.
    key = list(name) if isinstance(name, list) else [name]

    # HUGE hack to prevent list of ints from converting to list of floats
    # on C++ side
    key.append(None)

    src_sf = self._grouped_ts.get_group(key)

    # The temporary grouping columns may not be present in this group.
    try:
        src_sf.remove_columns(self._temp_col_names)
    except KeyError:
        pass

    return _graphlab.TimeSeries(src_sf, self.index_col_name, is_sorted=True)
'open'] # distance between Highest and Opening price data['lo'] = data['low'] - data[ 'open'] # distance between Lowest and Opening price data['gain'] = data['close'] - data['open'] # feature generation rsi_14 = RSI(14).generate(data) rsi_5 = RSI(5).generate(data) # rsi_14 = FeatureFactory.generate_rsi(data, n = 14) # rsi_5 = FeatureFactory.generate_rsi(data, n = 5) ma_20 = FeatureFactory.generate_moving_average(data, period=20) ma_5 = FeatureFactory.generate_moving_average(data, period=5) crossMA1_10 = crossMA(1, 10).generate(data) crossMA5_20 = crossMA(5, 20).generate(data) ts = gl.TimeSeries(data, index='datetime') # add the outcome variable, 1 if the bar was positive (close>open), 0 otherwise ts['outcome'] = ts.apply(lambda x: 1 if x['close'] > x['open'] else -1) # ts['ma5-20'] = ts[ma_5] - ts[ma_20] # GENERATE SOME LAGGED TIMESERIES ts_1 = ts.shift(1) # by 1 day ts['dRtn'] = ts['close'] / ts_1['close'] ts['idRtn'] = ts['close'] / ts['open'] ts_1 = ts.shift(1) # by 1 day ts_2 = ts.shift(2) ts_3 = ts.shift(3) ts['open_above_close'] = ts['open'] > ts_1['close'] ts['dRtn'] = ts['close'] / ts_1['close'] ts['idRtn'] = ts['close'] / ts['open'] ts['jump'] = ts['open'] > ts_1['high'] ts['aboveMA'] = ts['open'] > ts_1[ma_20]
import graphlab as gl
import datetime as dt

# One-time setup, kept for reference: fetch the sample SFrame from the
# remote URL and save a copy under the relative path "household_data".
# household_data = gl.SFrame(
#     "https://static.turi.com/datasets/household_electric_sample/household_electric_sample.sf")
#
# household_data.save("household_data")
##

# Load the SFrame from the path "household_data" and show its first rows.
household_data = gl.SFrame("household_data")
print (household_data.head(10))

# Build a TimeSeries indexed by the "DateTime" column and show its first
# rows.
household_ts = gl.TimeSeries(household_data, index="DateTime")
print (household_ts.head(10))
from dateutil import parser as datetime_parser ### Load Data ### # Table of product purchases purchases = gl.SFrame.read_csv('dataset/online_retail.csv') ### Prepare Data ### # Convert InvoiceDate strings (e.g. "12/1/10 8:26") to datetimes purchases['InvoiceDate'] = purchases['InvoiceDate'].apply(datetime_parser.parse) # Create a TimeSeries timeseries = gl.TimeSeries(purchases, 'InvoiceDate') ### Train the churn predictor model ### # Split the data into train and validation train, valid = gl.churn_predictor.random_split(timeseries, user_id='CustomerID', fraction=0.8, seed = 1) # A churn forecast requires a time boundary and a churn period. # Activity before the boundary is used to train the model. # After the boundary, activity (or lack of activity) # during the churn period is used to define whether the # user churned. # Train the model using data before August churn_boundary_oct = datetime.datetime(year = 2011, month = 8, day = 1)
def create(dataset, features=None, verbose=True):
    """
    Create an anomaly detection model. Based on the type of the input data,
    this function automatically chooses the anomaly detection model and the
    type of anomalies to search for. Generally speaking, if the input data
    appears to be a time series---if the dataset type is TimeSeries, one of
    the features is of type datetime.datetime, or there is only a single
    feature---the toolkit chooses the moving Z-score model.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input dataset. Determines the type of anomaly detection model and
        types of anomalies to search for.

    features : list[str], optional
        Names of columns in the input 'dataset' to use as features. If not
        specified, all columns of 'dataset' are used.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : GraphLab Create model

    Raises
    ------
    TypeError
        If 'dataset' is not an SFrame or TimeSeries, if it is empty, or if
        'features' is not a list of strings.

    ToolkitError
        If some of the 'features' are not columns of 'dataset', or if the
        data looks like a time series but does not have exactly one value
        feature and at most one datetime feature.

    See Also
    --------
    local_outlier_factor.create, graphlab.toolkits.dbscan.create

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    ...
    >>> m = graphlab.anomaly_detection.create(sf)
    >>> type(m)
    graphlab.toolkits.anomaly_detection.local_outlier_factor.LocalOutlierFactorModel
    ...
    >>> m['scores']
    +--------+----------------------+
    | row_id | local_outlier_factor |
    +--------+----------------------+
    |   2    |    0.951567102896    |
    |   0    |    0.951567102896    |
    |   5    |    1.00783754045     |
    |   4    |    0.982224576307    |
    |   3    |    1.05829898642     |
    |   1    |    1.05829898642     |
    |   6    |    2.52792223974     |
    +--------+----------------------+
    [7 rows x 2 columns]
    """
    _mt._get_metric_tracker().track('toolkit.anomaly_detection.create')

    ## Basic validation of the input dataset.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    # Fetch the column names once instead of once per feature below.
    column_names = dataset.column_names()

    if len(dataset) < 1 or len(column_names) < 1:
        raise TypeError("Input 'dataset' is empty.")

    ## Figure out the features and do basic validation.
    if features is None:
        features = column_names

    if (not isinstance(features, list) or
            not all(isinstance(c, str) for c in features)):
        raise TypeError("If specified, input 'features' must be a list " +
                        "of strings.")

    if not all(c in column_names for c in features):
        raise _ToolkitError("The specified features could not all be found " +
                            "in the input 'dataset'.")

    ## If any valid features are datetime types LOF is not valid.
    ## If there is more than one feature Z-score is not valid.

    # Figure out if there is a datetime column.
    col_types = dict(zip(column_names, dataset.column_types()))

    datetime_features = [c for c in features if col_types[c] == _dt.datetime]
    value_features = [c for c in features if col_types[c] != _dt.datetime]

    ## Decide which model to use. A TimeSeries always goes to the moving
    #  Z-score; an SFrame does so when it has a datetime feature or exactly
    #  one numeric value feature.
    try_zscore = (
        isinstance(dataset, _gl.TimeSeries)
        or len(datetime_features) > 0
        or (len(value_features) == 1
            and col_types[value_features[0]] in (int, float)))

    ## Create the relevant model.
    # The window size / neighbor count is 5% of the data, but at least 1.
    bandwidth = max(1, int(0.05 * len(dataset)))

    if try_zscore:
        if len(value_features) != 1 or len(datetime_features) > 1:
            raise _ToolkitError(
                "Cannot select an appropriate anomaly " +
                "detection model. For a " +
                "local outlier factor model, please remove " +
                "any datetime-type features. For a moving " +
                "Z-score model, please identify one data " +
                "feature (integer- or float-type) and at most " +
                "one datetime column as an index (this indexing is done " +
                "automatically for TimeSeries objects)")

        # An SFrame with a datetime feature is converted to a TimeSeries
        # indexed by that feature; anything else is passed on as a copy.
        if isinstance(dataset, _gl.SFrame) and len(datetime_features) == 1:
            _dataset = _gl.TimeSeries(dataset, index=datetime_features[0])
        else:
            _dataset = dataset[:]

        if verbose:
            print("Creating a moving Z-score anomaly detection model.")

        model = _gl.moving_zscore.create(dataset=_dataset,
                                         feature=value_features[0],
                                         window_size=bandwidth,
                                         verbose=verbose)

    ## If not doing the moving z-score, do local outlier factor.
    else:
        if verbose:
            print("Creating a local outlier factor model.")

        model = _gl.local_outlier_factor.create(dataset=dataset,
                                                features=features,
                                                num_neighbors=bandwidth,
                                                verbose=verbose)

    return model
# just to check if data is sorted in ascending mode
qq.head(3)
qq.save("SP500_daily.bin")

# once data is saved, we can use the following instruction to retrieve it
qq = gl.SFrame("SP500_daily.bin/")

# add the outcome variable: 1 if the trading session was positive
# (close > open), -1 otherwise
qq['outcome'] = qq.apply(lambda x: 1 if x['close'] > x['open'] else -1)

# we also need to add three new columns 'ho', 'lo' and 'gain';
# they will be useful to backtest the model, later
qq['ho'] = qq['high'] - qq['open']  # distance between Highest and Opening price
qq['lo'] = qq['low'] - qq['open']  # distance between Lowest and Opening price
qq['gain'] = qq['close'] - qq['open']

ts = gl.TimeSeries(qq, index='datetime')

# add the outcome variable: 1 if the bar was positive (close > open),
# -1 otherwise
ts['outcome'] = ts.apply(lambda x: 1 if x['close'] > x['open'] else -1)

# GENERATE SOME LAGGED TIMESERIES
ts_1 = ts.shift(1)  # by 1 day
ts_2 = ts.shift(2)  # by 2 days
# ...etc....
# how many days of lag are needed to create a good forecaster is an
# arbitrary decision, so everyone can experiment with their own choice

# add_features is a helper function, which is out of the scope of this article,
# and it returns a tuple with:
# ts: a timeseries object with, in addition to the already included columns, also lagged columns
#     as well as some features added to train the model, as shown above with feat1 and feat2 examples
# l_features: a list with all features used to train Classifier models
# g1 = gn['genre1'].unique() # g2 = gn['genre2'].unique() # g3 = gn['genre3'].unique() genres = { 'genre1': gn['genre1'].unique(), 'genre2': gn['genre2'].unique(), 'genre3': gn['genre3'].unique() } joined = scrobbles.join(gn, on='songID') joined['ts'] = joined['ts'].apply(lambda x: parse(x)) ts = gl.TimeSeries(joined, index='ts') ts.save('ts') total_listens = ts.resample(dt.timedelta(days=1), agg.COUNT()) total_listens.save(rootdir + '_total_listens') for level in ('genre1', 'genre2', 'genre3'): n = len(genres[level]) for i, genre in enumerate(genres[level]): current = ts[ts[level] == genre].resample(dt.timedelta(days=1), agg.COUNT()) #current.save(rootdir+level+'_'+genre) current.to_sframe().to_dataframe().to_pickle(rootdir + level + '_' + genre.replace('/', '-') + '.pkl')
def update(self, dataset, window_size=None, min_observations=None,
           verbose=True):
    """
    Create a new `MovingZScoreModel` with a new dataset. The `window_size`
    and `min_observations` parameters can also be updated with this method.

    The new model contains anomaly scores for each observation in the new
    `dataset`. In addition, the last `window_size` rows of the existing
    model's data and anomaly scores are prepended, for continuity and to
    show how the anomaly score is computed for the first few rows of the
    new `dataset`.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        New data to use for updating the model. The type of the input
        'dataset' must match the type of the data already in the model (if
        the model has data already).

    window_size : int, optional
        Length of the time window to use for defining the moving z-score
        value, in terms of number of observations. The window size will be
        the same as the current model's window size if a new window is not
        specified.

    min_observations : int, optional
        Minimum number of non-missing observations in the moving window
        required to compute the moving Z-score. If unspecified, the entire
        moving window preceding an observation must not contain any missing
        values in order for the observation to get an anomaly score. This
        parameter will be the same as the current model's value if not
        specified.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : MovingZScoreModel
        A *new* MovingZScoreModel, with an updated dataset and anomaly
        scores for the updated dataset. The `scores` field of the new model
        has the same schema as the `scores` field of the existing model,
        but data prepended from the existing results have a row ID of
        'None'.

    Raises
    ------
    TypeError
        If 'dataset' is not an SFrame or TimeSeries, is empty, or does not
        match the type of the data already in the model; or if
        'window_size' or 'min_observations' is not an integer.

    ValueError
        If 'window_size' or 'min_observations' is smaller than 1, or if the
        feature column is not of integer or float type.

    ToolkitError
        If a new TimeSeries starts before the end of the existing data, or
        if the model's feature cannot be found in 'dataset'.

    See Also
    --------
    create

    Examples
    --------
    >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
    ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
    >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
    ...                                                         window_size=3,
    ...                                                         feature='value')
    ...
    >>> sf2 = graphlab.SFrame({'year': [2010, 2011, 2012, 2013],
    ...                        'value': [18.4, 12.1, 12.0, 3.6]})
    >>> model2 = model.update(sf2)
    >>> model2['scores'].print_rows(max_column_width=20)
    +--------+----------------+-------+----------------+---------------------+
    | row_id | anomaly_score  | value | moving_average |  model_update_time  |
    +--------+----------------+-------+----------------+---------------------+
    |  None  | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:58... |
    |  None  | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:58... |
    |  None  | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:58... |
    |   0    | 0.801849542822 |  18.4 | 14.4666666667  | 2016-01-04 16:58... |
    |   1    | 0.391346818515 |  12.1 | 13.4666666667  | 2016-01-04 16:58... |
    |   2    | 0.593171014002 |  12.0 |      13.9      | 2016-01-04 16:58... |
    |   3    | 3.52963789428  |  3.6  | 14.1666666667  | 2016-01-04 16:58... |
    +--------+----------------+-------+----------------+---------------------+
    [7 rows x 5 columns]
    """
    start_time = _time.time()
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.moving_zscore.update')
    logger = _logging.getLogger(__name__)

    ## Validate the new dataset
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise TypeError("Input 'dataset' is empty.")

    if ((self.__proxy__['dataset_type'] == 'TimeSeries' and
         not isinstance(dataset, _gl.TimeSeries)) or
        (self.__proxy__['dataset_type'] == 'SFrame' and
         not isinstance(dataset, _gl.SFrame))):
        raise TypeError("New input 'dataset' must have the same type " +
                        "as the data already in the model.")

    ## Validate the new window size (if there is one), and figure out what
    #  the new window size will be.
    if window_size is None:
        window_size = self.__proxy__['window_size']
    else:
        if not isinstance(window_size, int):
            raise TypeError("Input 'window_size' must be an integer.")

        if window_size < 1:
            raise ValueError("Input 'window_size' must be greater than or " +
                             "equal to 1.")

    ## Validate and determine the `min_observations` parameter.
    if min_observations is None:
        min_observations = self.__proxy__['min_observations']
    else:
        if not isinstance(min_observations, int):
            raise TypeError("If specified, input 'min_observations' must " +
                            "be a positive integer.")

        if min_observations < 1:
            raise ValueError("If specified, input 'min_observations' must " +
                             "be a positive integer.")

    ## TimeSeries-specific dataset validation
    ## Make the sure new data occurs *after* the existing data.
    scores = self.__proxy__['scores']

    if isinstance(dataset, _gl.TimeSeries):
        first_new_timestamp = dataset[0][dataset.index_col_name]
        last_old_timestamp = scores[-1][scores.index_col_name]

        if first_new_timestamp < last_old_timestamp:
            raise _ToolkitError("The new dataset has data with " +
                                "earlier timestamps than the existing " +
                                "dataset. Please ensure that new data " +
                                "occurs after existing data.")

    ## Extract the feature from the new dataset and validate it.
    feature = self.__proxy__['feature']

    try:
        series = dataset[feature]
    except Exception:
        # Any lookup failure is reported as a missing feature, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        raise _ToolkitError("The feature specified by the original " +
                            "model could not be found in the input " +
                            "'dataset'.")

    if not series.dtype() in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")

    ## Create a new model and cut the old score object to the window size.
    new_state = {k: self.__proxy__[k]
                 for k in ['verbose', 'feature', 'dataset_type']}
    new_state['window_size'] = window_size
    new_state['min_observations'] = min_observations

    new_model = MovingZScoreModel(new_state)

    ## Save just the old data needed for the moving statistics on the new
    #  data.
    if len(scores) < window_size:
        old_scores = scores[:]
    else:
        old_scores = scores[-window_size:]

    ## Compute Z-scores and anomaly scores. The retained old values are
    #  put in front of the new ones so the moving window is already
    #  populated when the first new observation is scored.
    series = old_scores[feature].append(series)
    moving_average, moving_zscore, sufficient_data = \
        _moving_z_score(series, window_size, min_observations)

    anomaly_score = abs(moving_zscore)

    if not sufficient_data:
        logger.warning("The number of observations is smaller than " +
                       "the minimum number needed to compute a " +
                       "moving Z-score, so all anomaly scores are 'None'. " +
                       "Consider adding more data with the model's `update` " +
                       "method, or reducing the `window_size` or " +
                       "`min_observations` parameters.")

    ## General post-processing and formatting.
    scores = _gl.SFrame({feature: series,
                         'moving_average': moving_average,
                         'anomaly_score': anomaly_score})
    scores['model_update_time'] = _dt.datetime.now()

    scores = scores[[feature,  # reorder the columns
                     'moving_average',
                     'anomaly_score',
                     'model_update_time']]

    ## Replace the new Z-scores for the *old* data with the original
    #  Z-score for that data: only the rows belonging to the new dataset
    #  are kept from the recomputed table, and the retained old rows are
    #  joined back on unchanged.
    num_new_examples = len(dataset)
    new_scores = scores[-num_new_examples:]

    if isinstance(dataset, _gl.TimeSeries):
        new_scores[dataset.index_col_name] = dataset[dataset.index_col_name]
        new_scores = _gl.TimeSeries(new_scores, index=dataset.index_col_name)

        ## The index column should have the same name in the old and new
        #  data. If it doesn't, change the name in the old scores.
        if dataset.index_col_name != old_scores.index_col_name:
            old_scores = old_scores.rename(
                {old_scores.index_col_name: dataset.index_col_name})

            if verbose:
                logger.warning("The new dataset's index column name " +
                               "does not match the existing index " +
                               "column name. The new name is used in " +
                               "the new model.")

        final_scores = old_scores.union(new_scores)

    else:
        new_scores = new_scores.add_row_number('row_id')

        # Rows carried over from the old model get a missing row ID; the
        # cast keeps the column type compatible with the new row numbers.
        old_scores['row_id'] = None
        old_scores['row_id'] = old_scores['row_id'].astype(int)
        final_scores = old_scores.append(new_scores)

    ## Finalize and return the model.
    new_model.__proxy__['num_examples'] = len(scores)
    new_model.__proxy__['scores'] = final_scores
    new_model.__proxy__['training_time'] = _time.time() - start_time

    return new_model
def create(dataset, window_size, feature=None, min_observations=None,
           verbose=True):
    """
    Create a :class:`MovingZScoreModel` model. This model fits a moving
    average to a univariate time series and identifies points that are far
    from the fitted curve. The MovingZScoreModel works with either
    TimeSeries or SFrame inputs. A uniform sampling rate is assumed and the
    data window must be defined in terms of number of observations.

    This model differs from other GraphLab Create models in that it can be
    created from an existing `MovingZScoreModel`. To create a new model in
    this fashion, use the existing model's `update` method.

    The model created by this function contains a table `scores` that
    contains the computed anomaly scores. The type of `scores` matches the
    type of the input `dataset`, and the table contains 5 columns:

        - *row id/time*: ID of the corresponding row in the input
          `dataset`. If `dataset` is an SFrame, this is the row numbers of
          the input data; if `dataset` is a TimeSeries, it is the index of
          the time series.

        - *anomaly score*: absolute value of the moving Z-score. A score of
          0 indicates the value is identical to the moving average. The
          higher the score, the more likely a point is to be an anomaly.

        - *value*: input data. The name of this column matches the input
          `feature`.

        - *moving average*: moving average of each point's preceding
          `window_size` values.

        - *model update time*: time the model was updated. This is
          particularly useful if the `window_size` is larger than the
          number of rows in the input datasets, because the `scores` table
          has results from several updates.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    window_size : int
        Length of the time window to use for defining the moving z-score
        value, in terms of number of observations.

    feature : str, optional
        Name of the column to model. Any data provided to the model with
        either the `create` or `update` functions must have a column with
        this name. The feature name is not necessary if `dataset` is an
        SFrame with a single column or a TimeSeries with a single value
        column; it can be determined automatically in this case.

    min_observations : int, optional
        Minimum number of non-missing observations in the moving window
        required to compute the moving Z-score. If unspecified, the entire
        moving window preceding an observation must not contain any missing
        values in order for the observation to get an anomaly score.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : MovingZScoreModel
        A trained :class:`MovingZScoreModel`, which contains a table called
        `scores` that includes the anomaly score for each input data point.
        The type of the `scores` table matches the type of the input
        `dataset`.

    Raises
    ------
    TypeError
        If 'dataset' is not an SFrame or TimeSeries, or if 'window_size',
        'feature' or 'min_observations' has the wrong type.

    ValueError
        If 'window_size' or 'min_observations' is smaller than 1, or if the
        feature column is not of integer or float type.

    ToolkitError
        If 'dataset' is empty, if no feature is given for a multi-column
        dataset, or if the feature cannot be found in 'dataset'.

    See Also
    --------
    MovingZScoreModel, MovingZScoreModel.update

    Notes
    -----
    - The moving Z-score for a data point :math:`x_t` is simply the value
      of :math:`x_t` standardized by subtracting the moving mean just prior
      to time :math:`t` and dividing by the moving standard deviation just
      prior to :math:`t`. Suppose :math:`w` stands for the `window_size` in
      terms of the number of observations. Then the moving Z-score is:

      .. math:: z(x_t) = \\frac{x_t - \\bar{x}_t}{s_t}

      where the moving average is:

      .. math:: \\bar{x}_t = (1/w) \\sum_{i=t-w}^{t-1} x_i

      and the moving standard deviation is:

      .. math:: s_t = \\sqrt{(1/w) \\sum_{i=t-w}^{t-1} (x_i - \\bar{x}_t)^2}

    - The moving Z-score at points within `window_size` observations of the
      beginning of a series are not defined, because there are insufficient
      points to compute the moving average and moving standard deviation.
      This is represented by missing values.

    - Missing values in the input dataset are assigned missing values
      ('None') for their anomaly scores as well.

    - If there is no variation in the values preceding a given observation,
      the moving Z-score can be infinite or undefined. If the given
      observation is equal to the moving average, the anomaly score is
      coded as 'nan'; if the observation is *not* equal to the moving
      average, the anomaly score is 'inf'.

    Examples
    --------
    >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
    ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
    >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
    ...                                                         window_size=3,
    ...                                                         feature='value')
    >>> model['scores'].print_rows(max_column_width=20)
    +--------+----------------+-------+----------------+---------------------+
    | row_id | anomaly_score  | value | moving_average |  model_update_time  |
    +--------+----------------+-------+----------------+---------------------+
    |   0    |      None      |  12.2 |      None      | 2016-01-04 16:55... |
    |   1    |      None      |  11.7 |      None      | 2016-01-04 16:55... |
    |   2    |      None      |  12.5 |      None      | 2016-01-04 16:55... |
    |   3    | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:55... |
    |   4    | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:55... |
    |   5    | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:55... |
    +--------+----------------+-------+----------------+---------------------+
    [6 rows x 5 columns]
    """
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.moving_zscore.create')

    start_time = _time.time()
    logger = _logging.getLogger(__name__)

    ## Validate required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if not isinstance(window_size, int):
        raise TypeError("Input 'window_size' must be an integer.")

    if window_size < 1:
        raise ValueError("Input 'window_size' must be greater than or " +
                         "equal to 1.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if min_observations is not None:
        if not isinstance(min_observations, int):
            raise TypeError("If specified, input 'min_observations' must " +
                            "be a positive integer.")

        if min_observations < 1:
            raise ValueError("If specified, input 'min_observations' must " +
                             "be a positive integer.")

    ## Determine the feature name if left unspecified. For a TimeSeries
    #  only the value columns are candidates, not the index column.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the input 'dataset' has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")

    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except Exception:
        # Any lookup failure is reported as a missing feature, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")

    ## Validate the type of the feature.
    if not series.dtype() in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")

    ## Compute the moving average, Z-score, and a final anomaly score. For all
    #  anomaly detection models, the final score should be in the range [0,
    #  \infty], with higher values indicating more outlier-ness.
    moving_average, moving_zscore, sufficient_data = \
        _moving_z_score(series, window_size, min_observations)

    anomaly_score = abs(moving_zscore)

    if not sufficient_data:
        logger.warning("The number of observations is smaller than " +
                       "the minimum number needed to compute a " +
                       "moving Z-score, so all anomaly scores are 'None'. " +
                       "Consider adding more data with the model's `update` " +
                       "method, or reducing the `window_size` or " +
                       "`min_observations` parameters.")

    ## Format the results.
    scores = _gl.SFrame({feature: series,
                         'moving_average': moving_average,
                         'anomaly_score': anomaly_score})
    scores['model_update_time'] = _dt.datetime.now()

    scores = scores[['anomaly_score',  # reorder the columns
                     feature,
                     'moving_average',
                     'model_update_time']]

    if isinstance(dataset, _gl.SFrame):
        # The row-number column must not collide with the feature column.
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    if isinstance(dataset, _gl.TimeSeries):
        # Carry the input's index column over so 'scores' is a TimeSeries
        # with the same index as the input.
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]
        scores = _gl.TimeSeries(scores, index=dataset.index_col_name)

    dataset_type = 'TimeSeries' if isinstance(dataset, _gl.TimeSeries) \
        else 'SFrame'

    ## Set up the model.
    state = {
        'dataset_type': dataset_type,
        'verbose': verbose,
        'window_size': window_size,
        'min_observations': min_observations,
        'num_examples': len(dataset),
        'feature': feature,
        'training_time': _time.time() - start_time,
        'scores': scores}

    model = MovingZScoreModel(state)
    return model