Example #1
import pandas as pd

from src.utils.logger import log


def load_ts_data(path,
                 timestamp_col=None,
                 date_cols=None,
                 epoch_col=None,
                 set_index=False,
                 n_rows=None):
    """ path should be a full path to a csv file with exactly one of columns
        timestamp or epoch
    """
    assert isinstance(path, str)
    assert isinstance(timestamp_col, (str, type(None)))
    assert isinstance(date_cols,
                      (list, type(None)))  # columns to be parsed as dates
    if timestamp_col is not None:
        assert date_cols is not None and timestamp_col in date_cols, \
            'timestamp_col should be one of date_cols.'
    if date_cols is None:
        date_cols = False  # tells pd.read_csv not to parse any dates
    assert isinstance(epoch_col, (str, type(None)))
    assert isinstance(set_index, bool)
    assert isinstance(n_rows, (int, type(None)))

    if timestamp_col is not None:
        ts = pd.read_csv(path,
                         parse_dates=date_cols,
                         infer_datetime_format=True)
    else:
        ts = pd.read_csv(path)
    assert isinstance(ts, pd.DataFrame)
    assert (timestamp_col is None) != (epoch_col is None), \
        'Exactly one of timestamp_col and epoch_col should be given (not both).'

    log.debug('ts shape: {}.'.format(ts.shape))
    log.debug('ts head: {}'.format(ts.head()))

    if timestamp_col is None:
        # ts has epoch column. rename and add timestamp column
        ts.rename(columns={epoch_col: 'epoch'}, inplace=True)
        # convert to timestamp
        ts['timestamp'] = pd.to_datetime(ts['epoch'], unit='s')

    if epoch_col is None:
        # ts has timestamp column. rename and add epoch column
        ts.rename(columns={timestamp_col: 'timestamp'}, inplace=True)
        # convert to epoch (nanoseconds since epoch -> whole seconds)
        ts['epoch'] = ts['timestamp'].astype('int64') // 10**9

    if set_index:
        ts = ts.set_index('timestamp')
    if n_rows is not None:
        ts = ts.iloc[0:n_rows]

    return ts
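
A minimal usage sketch (the file names and column names below are hypothetical,
not from the original code; exactly one of timestamp_col/epoch_col may be given):

# CSV with a 'ts' datetime column
ts = load_ts_data('data/readings.csv',
                  timestamp_col='ts',
                  date_cols=['ts'],
                  set_index=True,
                  n_rows=1000)

# CSV with a 'unix_time' column holding epoch seconds
ts = load_ts_data('data/readings_epoch.csv', epoch_col='unix_time')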
Example #2
    def run(self):

        log.debug('Running feature engineering ..')
        fe_start_time = time.time()

        # load data
        data_dir = u'D:\\FAMILY\\Yuval\\Work\\Seebo\\'
        file_name = u'Yuval_TS_Table.csv'
        path = data_dir + file_name
        # TODO: investigate why parsing dates is so slow
        # date_cols = ['end_time_stamp', 'start_time', 'end_time']
        # data = pd.read_csv(path, parse_dates=date_cols, infer_datetime_format=True)
        data = pd.read_csv(path)
        sensors_per_batch = data.groupby('batch_id')['metric_id'].apply(
            lambda ts: ts.unique())
        if sensors_per_batch.shape[0] > 1:
            # TODO: validate every batch has the same sensor data more
            # elegantly (see the vectorized sketch after this method)
            first_batch_sensors = sensors_per_batch.iloc[0]
            for i in range(1, sensors_per_batch.shape[0]):
                assert np.array_equal(first_batch_sensors,
                                      sensors_per_batch.iloc[i]), \
                    'All batches should have the same sensors'

        # sort by (batch_id, metric_id, sensor_value)
        data = data.sort_values(by=['batch_id', 'metric_id', 'sensor_value'],
                                inplace=False)

        # impute missing values
        # TODO: replace this naive imputation method
        data['sensor_value'] = data['sensor_value'].fillna(0.0, inplace=False)

        # instantiate composite feature extractor
        # TODO: map self._feature_extractor_names to self._feature_extractor_objects
        gfe = GlobalFeatureExtractor(self._time_series_features_enricher)
        tfe = TemporalFeatureExtractor()
        cfe = CompositeFeatureExtractor([gfe, tfe])
        design_matrix = cfe.extract(data)

        feature_engineering_main_output = design_matrix

        fe_end_time = time.time()
        fe_duration = round((fe_end_time - fe_start_time) / 60, 2)
        log.debug(
            'Done running feature engineering [Total time: {} mins.].'.format(
                fe_duration))

        return feature_engineering_main_output
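
As a possible answer to the "more elegantly" TODO above, the per-batch loop
could be replaced by a single vectorized comparison of sensor sets (a sketch,
assuming the same data columns; not part of the original code):

sensor_sets = data.groupby('batch_id')['metric_id'].apply(frozenset)
assert sensor_sets.nunique() == 1, 'All batches should have the same sensors'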
Example #3
    def extract(self, data):

        assert isinstance(data, pd.DataFrame)
        # assert that data have no missing values
        assert not pd.isnull(
            data).values.any(), 'data should not contain missing values.'

        log.debug('Running Global feature extractor ..')
        gfe_start_time = time.time()

        # choose which time-series features to extract (self._fc_parameters),
        # or use the tsfresh default; e.g. one of the presets:
        # fc_parameters = MinimalFCParameters()
        # fc_parameters = EfficientFCParameters()
        # fc_parameters = ComprehensiveFCParameters()

        # feature extraction
        design_matrix = extract_features(
            data,
            default_fc_parameters=self._fc_parameters,
            column_id='batch_id',
            column_sort='end_time_stamp',
            column_kind='metric_id',
            column_value='sensor_value',
            n_jobs=self._num_of_cores_to_use)

        # impute: use a builtin tsfresh method that replaces NaN with the column
        # median, -inf with the column min and +inf with the column max,
        # column-wise and in place.
        # If a column contains no finite values at all, it is filled with zeros.
        # All columns are guaranteed to come out as np.float64.
        # (This can also be done by passing impute_function=impute to
        # extract_features().)
        impute(design_matrix)
        # TODO: assert that none of the columns was filled with zeros

        # TODO: think about feature selection as well (extract_relevant_features), see:
        # https://github.com/blue-yonder/tsfresh/blob/master/notebooks/robot_failure_example.ipynb
        # note though that this may be problematic for real-time ts anomaly detection

        gfe_end_time = time.time()
        gfe_duration = round((gfe_end_time - gfe_start_time) / 60, 2)

        log.debug(
            'Done running Global feature extractor [Total time: {} mins.].'.
            format(gfe_duration))

        return design_matrix
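
For reference, a self-contained sketch of the tsfresh call above on a toy
frame with the same column roles (the values are made up; MinimalFCParameters
is one of the presets mentioned in the comments and keeps the run fast):

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute

# long format: one row per (batch, sensor, timestamp) reading
toy = pd.DataFrame({
    'batch_id': ['b1'] * 4 + ['b2'] * 4,
    'metric_id': ['m1', 'm1', 'm2', 'm2'] * 2,
    'end_time_stamp': [1, 2, 1, 2] * 2,
    'sensor_value': [0.1, 0.4, 1.0, 1.2, 0.2, 0.3, 0.9, 1.1],
})

design_matrix = extract_features(toy,
                                 default_fc_parameters=MinimalFCParameters(),
                                 column_id='batch_id',
                                 column_sort='end_time_stamp',
                                 column_kind='metric_id',
                                 column_value='sensor_value')
impute(design_matrix)  # replace NaN/inf column-wise, in place
print(design_matrix.shape)  # one row per batch_id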
Example #4
    def extract(self, data):
        assert isinstance(data, pd.DataFrame)
        # assert that data have no missing values
        assert not pd.isnull(data).values.any(), 'data should not contain missing values.'

        log.debug('Running Composite feature extractor ..')
        cfe_start_time = time.time()

        matrices_lst = []
        for fe in self._feature_extractors:
            matrices_lst.append(fe.extract(data))

        design_matrix = pd.concat(matrices_lst, axis=1)

        cfe_end_time = time.time()
        cfe_duration = round((cfe_end_time - cfe_start_time) / 60, 2)
        log.debug('Done running Composite feature extractor [Total time: {} mins.].'.format(cfe_duration))

        return design_matrix
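
Note that pd.concat(..., axis=1) aligns on the row index, so every extractor
must return output indexed the same way (one row per batch_id) for the
column-wise concatenation to line up. A toy illustration with hypothetical
feature frames:

import pandas as pd

global_features = pd.DataFrame({'f1': [0.1, 0.2]}, index=['b1', 'b2'])
temporal_features = pd.DataFrame({'timespan': [360, 415]}, index=['b1', 'b2'])

design_matrix = pd.concat([global_features, temporal_features], axis=1)
#      f1  timespan
# b1  0.1       360
# b2  0.2       415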
Example #5
    def extract(self, data):

        assert isinstance(data, pd.DataFrame)
        # assert that data have no missing values
        assert not pd.isnull(
            data).values.any(), 'data should not contain missing values.'

        log.debug('Running Temporal feature extractor ..')
        tfe_start_time = time.time()

        design_matrix = data.groupby(
            ['batch_id',
             'metric_id'])['end_time_stamp'].aggregate(self._timespan)

        tfe_end_time = time.time()
        tfe_duration = round((tfe_end_time - tfe_start_time) / 60, 2)
        log.debug(
            'Done running Temporal feature extractor [Total time: {} mins.].'.
            format(tfe_duration))

        return design_matrix
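
self._timespan is not shown in this example; a plausible aggregator (an
illustrative assumption, not the original implementation) would reduce each
(batch_id, metric_id) group's timestamps to its duration in minutes:

import pandas as pd

def timespan(timestamps):
    # duration between the first and last reading, in minutes
    timestamps = pd.to_datetime(timestamps)
    return (timestamps.max() - timestamps.min()).total_seconds() / 60.0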
Example #6
def _validate_and_sort_data_prior_to_charting(data, anomalous_batch_id,
                                              sensor_id):
    assert not pd.isnull(
        data).any().any(), 'Data have missing values. Please check.'
    expected_columns = [
        'batch_id', 'sensor_id', 'timestamp', 'value', 'batch_label'
    ]
    assert set(data.columns) == set(expected_columns)
    s = 'batch {} has no records for sensor {}'.format(anomalous_batch_id,
                                                       sensor_id)
    assert sensor_id in data.loc[data['batch_id'] == anomalous_batch_id,
                                 'sensor_id'].unique(), s

    assert set(data['batch_label'].unique()) == {0, 1}

    assert (data.loc[data['batch_id'] == anomalous_batch_id,
                     'batch_label'] == 1
            ).all(), 'batch_id should be an abnormal batch.'

    normal_batches = data.loc[data['batch_label'] == 0].copy()
    number_of_normal_batches = len(normal_batches['batch_id'].unique())
    assert not normal_batches.empty, 'There are no normal batches.'
    s = 'At least one normal batch has no records for sensor: {}.'.format(
        sensor_id)
    assert normal_batches.groupby('batch_id')['sensor_id'].aggregate(
        lambda ts: sensor_id in ts.unique()).sum(
        ) == number_of_normal_batches, s

    # sort data by (batch_id, sensor_id, timestamp)
    data = data.sort_values(by=['batch_id', 'sensor_id', 'timestamp'],
                            inplace=False)

    log.debug(
        'Done validating and sorting data by (batch_id, sensor_id, timestamp) prior to charting.'
    )

    return data
Example #7
import datetime
import random
import string

import numpy as np
import pandas as pd

from src.utils.logger import log


def generate_fake_data(n_batches=150, n_sensors=300, batch_anomalous_probability=0.3,
                       mu_0=0.0, mu_1=1.5, sd_0=1.0, sd_1=1.0):
    log.debug('Generating fake data: {} batches, {} sensors, batch anomalous probability: {}'.format(
        n_batches, n_sensors, batch_anomalous_probability))

    batch_ids = ['B-' + ''.join(random.choice('0123456789ABCDEF') for i in range(8)) for _ in range(n_batches)]
    sensor_ids = ['S-' + ''.join(random.choice(string.ascii_lowercase) for i in range(8)) for _ in range(n_sensors)]

    data = []

    for batch_id in batch_ids:

        # create target labels
        batch_label = np.random.binomial(1, batch_anomalous_probability, 1)[0]

        # create data
        for sensor_id in sensor_ids:
            hour = random.randint(10, 12)
            minute = int(random.choice(np.arange(0, 60, 5)))
            min_timestamp = pd.Timestamp(2018, 11, 1, hour, minute)
            sensor_duration_in_minutes = random.choice(np.arange(300, 420, 5))  # 5 to 7 hours
            max_timestamp = min_timestamp + datetime.timedelta(minutes=float(sensor_duration_in_minutes))
            timestamps = pd.date_range(min_timestamp, max_timestamp, freq='5min')

            if batch_label == 0:
                values = np.random.normal(mu_0, sd_0, len(timestamps))
            else:
                values = np.random.normal(mu_1, sd_1, len(timestamps))

            for timestamp, value in zip(timestamps, values):
                data.append([batch_id, sensor_id, timestamp, value, batch_label])

    data = pd.DataFrame(data, columns=['batch_id', 'sensor_id', 'timestamp', 'value', 'batch_label'])

    log.debug('Done generating fake data: {} batches, {} sensors.'.format(n_batches, n_sensors))

    return data
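
A quick usage sketch: the generated frame matches the schema that
_validate_and_sort_data_prior_to_charting (Example #6) expects, so the two
can be smoke-tested together (small sizes keep it fast):

fake = generate_fake_data(n_batches=20, n_sensors=3)
anomalous = fake.loc[fake['batch_label'] == 1, 'batch_id']
if not anomalous.empty:  # anomalous batches are drawn at random
    _validate_and_sort_data_prior_to_charting(fake,
                                              anomalous.iloc[0],
                                              fake['sensor_id'].iloc[0])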
Example #8
def create_anomalous_charts(data,
                            anomalous_batch_id,
                            sensor_id,
                            dir=None,
                            show=True,
                            plotly=False):
    log.debug(
        'Creating prospect/retrospect charts (Forward/Backward View) for batch {} and sensor {}.'
        .format(anomalous_batch_id, sensor_id))

    assert isinstance(data, pd.DataFrame)
    assert isinstance(anomalous_batch_id, str)
    assert isinstance(sensor_id, str)
    assert isinstance(dir, (str, type(None)))
    assert isinstance(show, bool)

    data = _validate_and_sort_data_prior_to_charting(data, anomalous_batch_id,
                                                     sensor_id)

    data_for_chart = _prepare_data_for_chart(data, anomalous_batch_id,
                                             sensor_id)

    batch_values = data_for_chart.get('batch_values')
    # forward view
    batch_duration_in_minutes_forward_view = data_for_chart.get(
        'batch_duration_in_minutes_forward_view')
    normal_batches_duration_in_minutes_forward_view = data_for_chart.get(
        'normal_batches_duration_in_minutes_forward_view')
    normal_batches_averages_forward_view = data_for_chart.get(
        'normal_batches_averages_forward_view')
    normal_batches_lower_values_forward_view = data_for_chart.get(
        'normal_batches_lower_values_forward_view')
    normal_batches_upper_values_forward_view = data_for_chart.get(
        'normal_batches_upper_values_forward_view')

    # backward view
    batch_duration_in_minutes_backward_view = data_for_chart.get(
        'batch_duration_in_minutes_backward_view')
    normal_batches_duration_in_minutes_backward_view = data_for_chart.get(
        'normal_batches_duration_in_minutes_backward_view')
    normal_batches_averages_backward_view = data_for_chart.get(
        'normal_batches_averages_backward_view')
    normal_batches_lower_values_backward_view = data_for_chart.get(
        'normal_batches_lower_values_backward_view')
    normal_batches_upper_values_backward_view = data_for_chart.get(
        'normal_batches_upper_values_backward_view')

    fig, ax = plt.subplots(2, 1)

    ax[0].plot(batch_duration_in_minutes_forward_view,
               batch_values,
               marker='',
               color='red',
               label='Batch id: {}'.format(anomalous_batch_id))
    ax[0].plot(normal_batches_duration_in_minutes_forward_view,
               normal_batches_averages_forward_view,
               marker='',
               color='green',
               linewidth=3,
               label='Normal Batches (avg.)')
    ax[0].fill_between(normal_batches_duration_in_minutes_forward_view,
                       normal_batches_lower_values_forward_view,
                       normal_batches_upper_values_forward_view,
                       color='lightgreen',
                       alpha=0.2)
    ax[0].set_title('Prospect (Forward) View', size=12)
    ax[0].set_xlabel('Minutes (since start)')
    ax[0].legend()

    ax[1].plot(batch_duration_in_minutes_backward_view,
               batch_values,
               marker='',
               color='red',
               label='Batch id: {}'.format(anomalous_batch_id))
    ax[1].plot(normal_batches_duration_in_minutes_backward_view,
               normal_batches_averages_backward_view,
               marker='',
               color='green',
               linewidth=3,
               label='Normal Batches (avg.)')
    ax[1].fill_between(normal_batches_duration_in_minutes_backward_view,
                       normal_batches_lower_values_backward_view,
                       normal_batches_upper_values_backward_view,
                       color='lightgreen',
                       alpha=0.2)
    ax[1].set_title('Retrospect (Backward) View', size=12)
    ax[1].set_xlabel('Minutes (prior to end)')
    ax[1].legend()

    fig.suptitle('Anomaly Charts for batch id: {} and sensor id: {}'.format(
        anomalous_batch_id, sensor_id),
                 size=15)

    if show:
        fig.show()

    if dir is not None:
        file_name = ('anomaly_chart_batch_id_' + anomalous_batch_id +
                     '_sensor_id_' + sensor_id + '.pdf')
        full_path = dir + file_name

        fig.set_size_inches(10, 10)
        fig.savefig(full_path, dpi=100)

    if plotly:
        abnormal_batch = go.Scatter(x=batch_duration_in_minutes_forward_view,
                                    y=batch_values,
                                    name='Abnormal Batch',
                                    mode='lines+markers',
                                    line=dict(color='red'))
        normal_batches_average = go.Scatter(
            x=normal_batches_duration_in_minutes_forward_view,
            y=normal_batches_averages_forward_view,
            name='Normal Batches',
            line=dict(color='green', width=4))
        normal_batches_lower = go.Scatter(
            x=normal_batches_duration_in_minutes_forward_view,
            y=normal_batches_lower_values_forward_view,
            name='lower',
            hoverinfo='skip',
            fill=None,
            mode='lines',
            line=dict(color='lightgreen'),
            showlegend=False)
        normal_batches_upper = go.Scatter(
            x=normal_batches_duration_in_minutes_forward_view,
            y=normal_batches_upper_values_forward_view,
            name='upper',
            hoverinfo='skip',
            fill='tonexty',
            #fillcolor='lightgreen',
            mode='lines',
            line=dict(color='lightgreen'),
            showlegend=False)

        data = [
            normal_batches_lower, normal_batches_upper, abnormal_batch,
            normal_batches_average
        ]
        layout = dict(title='Prospect')

        fig = dict(data=data, layout=layout)
        plot(fig)

    log.debug(
        'Done creating prospect/retrospect charts (Forward/Backward View) for batch {} and sensor {}.'
        .format(anomalous_batch_id, sensor_id))
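
Continuing the sketch after Example #7, the charting entry point could then be
invoked as follows (hypothetical ids; _prepare_data_for_chart must be available
from the same module, the charts/ directory must already exist, and dir needs a
trailing separator since the pdf path is built by string concatenation):

create_anomalous_charts(fake,
                        anomalous_batch_id=anomalous.iloc[0],
                        sensor_id=fake['sensor_id'].iloc[0],
                        dir='charts/',
                        show=False)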
Example #9
def plot_ts_and_anomalies(ts,
                          value_col,
                          anomalies,
                          anomaly_scores,
                          ts_only=False,
                          dir=None,
                          show=True,
                          plotly=False):
    assert isinstance(ts, pd.DataFrame)
    assert isinstance(value_col, str)
    assert not pd.isnull(ts[value_col]).any(), 'value_col has missing data'
    assert isinstance(anomalies, list)
    assert isinstance(anomaly_scores, TimeSeries)
    assert isinstance(ts_only, bool)
    assert isinstance(dir, (str, type(None)))
    if dir is not None:
        assert dir[-1] == '/'
        if not os.path.exists(dir):
            os.makedirs(dir)
    assert isinstance(show, bool)
    assert isinstance(plotly, bool)

    assert ts['timestamp'].is_unique, \
        'timestamp should not have duplicated values'

    if (len(anomalies) == 0) or ts_only:
        # plot ts only
        if len(anomalies) == 0:
            log.debug('Found no anomalies.')
        plt.plot_date(ts['timestamp'], ts[value_col], color='blue', fmt='-')
        plt.title('ts', size=12)

        if show:
            plt.show()

        # TODO: add plotly plot in this case as well

    else:
        # plot ts and anomalies
        log.debug('Found {} anomalies.'.format(len(anomalies)))

        scores = anomaly_scores.values

        fig, ax = plt.subplots(2, 1)

        # plot ts
        # ax[0].plot(ts['epoch'], ts[value_col], color='blue')
        ax[0].plot_date(ts['timestamp'], ts[value_col], color='blue', fmt='-')
        ax[0].set_title('ts', size=12)

        # plot anomalies on top of ts
        for anomaly in anomalies:
            anomaly_time_window = anomaly.get_time_window()
            epoch_left = anomaly_time_window[0]
            epoch_right = anomaly_time_window[1]
            timestamp_left = ts.loc[ts['epoch'] == epoch_left,
                                    'timestamp'].values[0]
            timestamp_right = ts.loc[ts['epoch'] == epoch_right,
                                     'timestamp'].values[0]
            ax[0].axvspan(timestamp_left,
                          timestamp_right,
                          alpha=0.5,
                          color='gray')

        # plot anomaly scores
        # ax[1].plot(ts['epoch'], scores, color='red')
        ax[1].plot_date(ts['timestamp'], scores, color='red', fmt='-')
        ax[1].set_title('scores', size=12)

        if show:
            fig.show()

        if dir is not None:
            file_name = 'ts_and_anomaly_scores.pdf'
            full_path = dir + file_name

            fig.set_size_inches(10, 10)
            fig.savefig(full_path, dpi=100)

        if plotly:
            if dir is not None:
                file_name = 'ts_and_anomaly_scores.html'
                full_path = dir + file_name
                time_series = go.Scatter(x=ts['timestamp'],
                                         y=ts[value_col],
                                         name='ts',
                                         mode='lines',
                                         line=dict(color='blue'))
                anomaly_scores = go.Scatter(x=ts['timestamp'],
                                            y=scores,
                                            name='scores',
                                            line=dict(color='red'))

                fig = tools.make_subplots(rows=2,
                                          cols=1,
                                          specs=[[{}], [{}]],
                                          shared_xaxes=True,
                                          shared_yaxes=False)

                fig.append_trace(time_series, 1, 1)
                fig.append_trace(anomaly_scores, 2, 1)

                # fig['layout'].update(height=600, width=800, title='Time series and anomaly scores')
                fig['layout'].update(title='Time series and anomaly scores')
                plot(fig, filename=full_path)
            else:
                log.debug(
                    'Need to supply a dir in order to generate plotly chart.')
Example #10
import csv
from datetime import datetime

import pandas as pd

from src.utils.logger import log

pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.max_rows', None, 'display.max_columns', None)

change_timestamp_format = False

if change_timestamp_format:
    log.debug('Loading single_batch data ..')
    date_parser = lambda x: datetime.strptime(x, '%d/%m/%y %H:%M')
    data = pd.read_csv('/Users/yuval/Downloads/Sensor_readings.csv',
                       parse_dates=['end_time_stamp'],
                       date_parser=date_parser)
    log.debug('Done loading single_batch data.')

    date_format = u'%Y-%m-%d'
    timestamp_format = u'%Y-%m-%d %H:%M:%S'

    log.debug('Persisting to csv new timestamp format ..')
    data.to_csv('/Users/yuval/Downloads/Sensor_readings_YUVAL.csv',
                index=False,
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                date_format=timestamp_format)
    log.debug('Done persisting to csv new timestamp format.')
else:
    log.debug('Loading single_batch data ..')