    def __init__(self, subject, targets, time_cols, source_db, result_db):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols

        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)

        self.scaler = StandardScaler()

        self.modelers = {}
        self.model_path = './models/clust/'
def upload_simple_data():
    db = DataBase('BaseDB.db')
    res = {}

    # `subjects` and `targets` are module-level lists of subject names
    # and target columns.
    for sub in subjects:
        sub_df = db.read(sub + '_source').get_data()[-150:]
        for target in targets:
            res[sub + '_' + target] = sub_df[['Время', target]]
            res[sub + '_' + target].columns = ['time', 'actual']

    return res
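
# A minimal usage sketch for the uploader above. The module-level `subjects`
# and `targets` lists it relies on are assumed here with hypothetical values;
# only the '<subject>_source' table convention comes from the code above.
if __name__ == '__main__':
    subjects = ['host_a', 'host_b']    # hypothetical subject names
    targets = ['ConnectionCount']      # hypothetical target columns

    simple = upload_simple_data()
    # each value is a DataFrame with columns ['time', 'actual']
    print(simple['host_a_ConnectionCount'])
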
def upload_clust_data():
    db = DataBase('BaseDB.db')
    res = {}

    for sub in subjects:
        # The first 10 rows need to be cut off.
        res[sub] = db.read('Analitics_' + 'clust_' + sub).get_data()[-150:]
        print('Clust ', sub)
        print(res[sub][:50])
        print(res[sub][-50:])

    return res
Example #4
    def __init__(self, subject, targets, time_cols, source_db, result_db,
                 n_predict, volume, interval):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols

        self.n_predict = n_predict
        self.volume = volume
        self.interval = interval

        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)

        self.modelers = {}

        self.models_path = './models/separated/model_' + self.subject + '_'
def upload_separate_data():
    db = DataBase('BaseDB.db')
    res = {}

    for sub in subjects:
        for target in targets:
            table_name = 'Analitics_' + 'sep_' + sub + '_' + target
            if db.get_table_length(table_name) == -1:
                print('Table ' + table_name + " doesn't exist yet!")
            else:
                res[sub + '_' + target] = db.read(table_name).get_data()[-150:]
                print(res[sub + '_' + target])

    return res
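
# A sketch under the same assumptions as the earlier uploader example:
# `subjects` and `targets` are hypothetical module-level lists, and the
# 'Analitics_sep_<subject>_<target>' table convention comes from the code
# above.
if __name__ == '__main__':
    subjects = ['host_a']              # hypothetical subject name
    targets = ['ConnectionCount']      # hypothetical target column

    separate = upload_separate_data()
    for key, df in separate.items():
        # each frame holds the last 150 analytics rows for one subject/target
        print(key, len(df))
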
Example #6
    def __init__(self, destination_db, use_cols, data_path, time_col,
                 cols_to_average, entities_to_use, group_by, avg_window='300S',
                 decode_UNIX_time=False, prepare_time=False,
                 trig_prepare_time=False, mode='load', from_outer=False,
                 time_to_proc=None):

        self.destination_db = DataBase(destination_db)
        self.use_cols = use_cols
        self.data_path = data_path
        self.from_outer = from_outer
        self.time_to_proc = time_to_proc
        self.mode = mode

        self.time_col = time_col
        self.cols_to_average = cols_to_average
        self.group_by = group_by
        self.avg_window = avg_window

        self.entities_to_use = entities_to_use

        self.decode_UNIX_time = decode_UNIX_time
        self.prepare_time = prepare_time
        self.trig_prepare_time = trig_prepare_time

        self.data = pd.DataFrame()
Example #7
class CoreSeparated:
    def __init__(self, subject, targets, time_cols, source_db, result_db,
                 n_predict, volume, interval):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols

        self.n_predict = n_predict
        self.volume = volume
        self.interval = interval

        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)

        self.modelers = {}

        self.models_path = './models/separated/model_' + self.subject + '_'

    def fit_core(self, first=False):
        data = self.source_db.read(self.subject + '_source').get_data()
        for target in self.targets:
            modeler = Modeler(model_path=self.models_path + target + '.txt',
                              n_predict=self.n_predict,
                              volume=self.volume,
                              scaler='std',
                              drop_null=True)

            X, y = modeler.prepare_data(X=data[self.time_cols + ['Время']],
                                        y=data[target],
                                        predict_phase=True)

            n_folds = 6
            if len(X) <= self.n_predict + n_folds:
                print("Warning: model wasn't trained, not enough clean "
                      'data for ' + target)
                continue

            # Needed to know when the data arrived
            time_col = pd.to_datetime(X['Время'])

            # True when a saved model already exists, i.e. not the first start
            model_exists = os.path.isfile(self.models_path + target + '.txt')

            X.drop(['Время'], axis=1, inplace=True)
            modeler.fit(X.iloc[:-self.n_predict, :], y[:-self.n_predict])
            self.modelers[self.subject + '_' + target] = modeler

            if not model_exists:
                predictions, lower, upper = modeler.predict(X, interval=True)

                y_anom_search = y[:-self.n_predict].copy()

                anomalies = CoreSeparated.find_anomalies(
                    y_anom_search, lower[:-self.n_predict],
                    upper[:-self.n_predict])
                anom_dummy = np.full((self.n_predict, ), np.NaN)

                anomalies_res = np.concatenate([anomalies, anom_dummy])
                y_res = np.concatenate([y_anom_search, anom_dummy])

                # Interpolate timestamps over the predicted periods
                time_dummy = pd.date_range(
                    start=time_col.iloc[-self.n_predict - 1],
                    periods=self.n_predict + 1,
                    closed='right',
                    freq=self.interval)
                time_col = pd.concat(
                    [time_col.iloc[:-self.n_predict],
                     pd.Series(time_dummy)],
                    axis=0).astype(str)

                # Prepend the first values of the metric, which carry no predictions
                time_first = data['Время'][:self.volume]
                y_first = data[target][:self.volume]
                dummy_first = np.array(
                    [None for counter in range(self.volume)])

                time_col = pd.concat([pd.Series(time_first), time_col],
                                     axis=0).astype(str)
                y_res = np.concatenate([y_first, y_res], axis=0)
                predictions = np.concatenate([y_first, predictions], axis=0)
                lower = np.concatenate([dummy_first, lower], axis=0)
                upper = np.concatenate([dummy_first, upper], axis=0)
                anomalies_res = np.concatenate([dummy_first, anomalies_res],
                                               axis=0)

                df_result = pd.DataFrame(np.vstack([
                    time_col, y_res, predictions, lower, upper, anomalies_res
                ]).T,
                                         columns=[
                                             'time', 'actual', 'predictions',
                                             'lower', 'upper', 'anomalies'
                                         ])
                self.result_db.write_from_df(Table(
                    'Analitics_' + 'sep_' + self.subject + '_' + target,
                    df_result),
                                             method='replace')

    @staticmethod
    def find_anomalies(y_true, lower, upper):
        anomalies = np.array([np.NaN] * len(y_true))
        anomalies[y_true < lower] = y_true[y_true < lower]
        anomalies[y_true > upper] = y_true[y_true > upper]

        return anomalies
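
    # A toy check of find_anomalies with hypothetical values: points below
    # `lower` or above `upper` are echoed back, everything else is NaN.
    #   >>> y = np.array([1.0, 5.0, 2.0])
    #   >>> CoreSeparated.find_anomalies(y,
    #   ...                              lower=np.array([0., 0., 3.]),
    #   ...                              upper=np.array([2., 2., 4.]))
    #   array([nan,  5.,  2.])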

    def load_models(self):
        """
        Loads None if the model doesn't exist yet.
        """
        for target in self.targets:
            if os.path.isfile(self.models_path + target + '.txt'):
                modeler = Modeler(model_path=self.models_path + target +
                                  '.txt',
                                  n_predict=self.n_predict,
                                  volume=self.volume,
                                  scaler='std',
                                  drop_null=True)
                modeler.load_model_params()
            else:
                # the model doesn't exist yet
                modeler = None

            self.modelers[self.subject + '_' + target] = modeler

    def make_predictions(self):
        for target in self.targets:
            modeler = self.modelers[self.subject + '_' + target]
            if modeler is None:
                # the model doesn't exist yet; skip this target
                continue

            volume = modeler.get_volume()

            batch_volume = 1
            data = self.source_db.read(self.subject + '_source',
                                       volume=volume +
                                       self.n_predict).get_data()

            X, y = modeler.prepare_data(X=data[self.time_cols + ['Время']],
                                        y=data[target],
                                        predict_phase=True,
                                        drop_null=False)

            time = pd.Series(pd.to_datetime(X['Время']))

            start_time = time.iloc[-self.n_predict - batch_volume]
            time = pd.Series(
                pd.date_range(start=start_time,
                              periods=self.n_predict + batch_volume,
                              closed='left',
                              freq=self.interval))
            time = time.astype(str)

            X.drop(['Время'], axis=1, inplace=True)

            predictions, lower, upper = modeler.predict(X, interval=True)

            anomalies = CoreSeparated.find_anomalies(y, lower, upper)

            # send the data to an external messenger here

            actual = data[target][-batch_volume:]
            actual_dummy = np.array(
                [np.NaN for i in range(len(predictions) - batch_volume)])
            actual = np.concatenate([actual, actual_dummy])

            df_result = pd.DataFrame(np.vstack(
                [time, actual, predictions, lower, upper, anomalies]).T,
                                     columns=[
                                         'time', 'actual', 'predictions',
                                         'lower', 'upper', 'anomalies'
                                     ])
            self.result_db.write_from_df(Table(
                'Analitics_' + 'sep_' + self.subject + '_' + target,
                df_result),
                                         method='replace last')
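
# A lifecycle sketch for CoreSeparated with hypothetical argument values;
# only the constructor signature and the method names come from the class
# above. `time_cols` should match the time-feature columns produced by
# DataPreparator.preprocess_data.
if __name__ == '__main__':
    core = CoreSeparated(subject='host_a',
                         targets=['ConnectionCount'],
                         time_cols=['sin_hour', 'cos_hour', 'sin_weekday',
                                    'cos_weekday', 'is_weekend'],
                         source_db='BaseDB.db',
                         result_db='BaseDB.db',
                         n_predict=12,
                         volume=150,
                         interval='300S')

    core.fit_core()          # train models; writes history if none were saved
    core.load_models()       # or restore previously saved models
    core.make_predictions()  # write fresh predictions to the result DB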
Example #8
class DataPreparator:
    def __init__(self, destination_db, use_cols, data_path, time_col,
                 cols_to_average, entities_to_use, group_by, avg_window='300S',
                 decode_UNIX_time=False, prepare_time=False,
                 trig_prepare_time=False, mode='load', from_outer=False,
                 time_to_proc=None):

        self.destination_db = DataBase(destination_db)
        self.use_cols = use_cols
        self.data_path = data_path
        self.from_outer = from_outer
        self.time_to_proc = time_to_proc
        self.mode = mode

        self.time_col = time_col
        self.cols_to_average = cols_to_average
        self.group_by = group_by
        self.avg_window = avg_window

        self.entities_to_use = entities_to_use

        self.decode_UNIX_time = decode_UNIX_time
        self.prepare_time = prepare_time
        self.trig_prepare_time = trig_prepare_time

        self.data = pd.DataFrame()

    def upload_data(self):
        if self.from_outer:
            # Launch the external data-loading script here
            # (e.g. reading from Kafka).
            kafka_conn = KafkaConnector()
            df = kafka_conn.load_data()
        else:
            df = self.destination_db.read(self.data_path).get_data()

        if len(df) == 0:
            df = pd.DataFrame([], columns=self.use_cols)

        if self.decode_UNIX_time:
            # convert timestamps to a human-readable format
            df[self.time_col] = df[self.time_col].apply(
                lambda x: datetime.fromtimestamp(x))
        else:
            df[self.time_col] = pd.to_datetime(df[self.time_col])

        if self.time_to_proc and self.mode == 'operate':
            # keep only the rows that fall into the current averaging window
            delta = timedelta(seconds=int(self.avg_window[:-1]))
            thr_low = self.time_to_proc - delta <= df[self.time_col]
            thr_up = self.time_to_proc > df[self.time_col]

            df = df[thr_low & thr_up]
            # Here, drop everything older from the source if necessary.

        self.data = df

    def preprocess_data(self):
        """
        df - исходные данные (только с нужными колонками)
        time_col - колонка времени
        cols_to_average - Колонки, для которых надо посчитать среднее значение по периоду (для остальных - сумма по периоду)
        entities_to_use - Значения колонки источников, необходимые для рассмотрения
        group_by - Колонка с источниками
        avg_window - Период, по который нужно сужать данные
        decode_UNIX_time - НАдо ли декодировать время из формата времени UNIX
        prepare_time - Нужно ли добавлять признаки час, день недели, выходной день
        trig_prepare_time - Кодировка времени по sin/cos


        return
        - dict - {источник:данные}
        - list - название колонок временных признаков
        """
        self.upload_data()
        df = self.data

        # Add a column of ones so that, after aggregation, we know the
        # number of connections in each period
        df['ConnectionCount'] = np.ones(len(df), dtype=int)

        # Dictionary for the results
        result = {}

        for entity in self.entities_to_use:
            df_res = df[df[self.group_by] == entity]
            if len(df_res) == 0:
                if not self.time_to_proc:
                    raise Exception(
                        'Start-up error: the system was started without data '
                        'in the buffer and without specifying time_to_proc')

                # Build a single zero-filled row stamped one averaging window
                # before time_to_proc, so that resampling still yields a period
                a = np.zeros(shape=(1, len(df.columns)), dtype=int)
                df_res = pd.DataFrame(a, columns=df_res.columns, index=[0])
                df_res.loc[0, self.time_col] = (
                    pd.to_datetime(self.time_to_proc) -
                    timedelta(seconds=int(self.avg_window[:-1])))
                df_res.drop([self.group_by], axis=1, inplace=True)

            df_res = df_res.resample(self.avg_window, on=self.time_col).sum()

            df_res[self.time_col] = df_res.index
            df_res.reset_index(drop=True, inplace=True)

            if self.prepare_time or self.trig_prepare_time:
                # datetime features
                df_res["hour"] = df_res[self.time_col].apply(lambda x: x.hour)
                df_res["weekday"] = df_res[self.time_col].apply(lambda x: x.weekday())
                df_res['is_weekend'] = df_res[self.time_col].apply(lambda x: x.weekday()).isin([5, 6]) * 1

                if self.trig_prepare_time:
                    hours_in_day = 24
                    weekdays_in_week = 7

                    df_res['sin_hour'] = np.sin(2 * np.pi * df_res["hour"] / hours_in_day)
                    df_res['cos_hour'] = np.cos(2 * np.pi * df_res["hour"] / hours_in_day)

                    df_res['sin_weekday'] = np.sin(2 * np.pi * df_res["weekday"] / weekdays_in_week)
                    df_res['cos_weekday'] = np.cos(2 * np.pi * df_res["weekday"] / weekdays_in_week)

                    df_res.drop(['hour'], axis=1, inplace=True)
                    df_res.drop(['weekday'], axis=1, inplace=True)

            for col in self.cols_to_average:
                df_res[col] = df_res[col] / df_res['ConnectionCount']
                df_res[[col]] = df_res[[col]].fillna(value=0)

            result[entity] = df_res

        if self.trig_prepare_time:
            result_time_cols = [self.time_col, 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday', 'is_weekend']
        elif self.prepare_time:
            result_time_cols = [self.time_col, 'hour', 'weekday', 'is_weekend']
        else:
            result_time_cols = [self.time_col]

        return result, result_time_cols

    def load_data_to_db(self, result_dict):
        for subject in result_dict:
            self.destination_db.write_from_df(Table(subject + '_source',
                                                    result_dict[subject]),
                                              method='append')
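
# An end-to-end sketch of DataPreparator with hypothetical argument values
# (column and table names included); the parameter names and the
# preprocess_data()/load_data_to_db() flow come from the class above.
if __name__ == '__main__':
    preparator = DataPreparator(destination_db='BaseDB.db',
                                use_cols=['Время', 'Host', 'Bytes'],
                                data_path='raw_connections',
                                time_col='Время',
                                cols_to_average=['Bytes'],
                                entities_to_use=['host_a'],
                                group_by='Host',
                                avg_window='300S',
                                trig_prepare_time=True)

    result, time_cols = preparator.preprocess_data()
    preparator.load_data_to_db(result)   # writes the '<entity>_source' tables
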
class CoreClust:
    def __init__(self, subject, targets, time_cols, source_db, result_db):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols

        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)

        self.scaler = StandardScaler()

        self.modelers = {}
        self.model_path = './models/clust/'

    def fit_core(self, first=False):
        data = self.source_db.read(self.subject + '_source').get_data()
        time_col = data['Время']
        data = data[self.targets]

        data_scaled = self.scaler.fit_transform(data)

        isolation_forest = IsolationForest(n_estimators=160,
                                           contamination=0.05,
                                           max_features=1.0,
                                           bootstrap=True,
                                           random_state=42)
        isolation_forest.fit(data_scaled)
        self.modelers['IsolationForest'] = isolation_forest
        self.save_models()

        if first:
            isolation_outliers = isolation_forest.predict(data_scaled)
            isolation_outliers = np.array(
                [1 if label == -1 else 0 for label in isolation_outliers])

            df_result = pd.DataFrame(np.vstack([time_col,
                                                isolation_outliers]).T,
                                     columns=['time', 'outliers'])

            self.result_db.write_from_df(Table(
                'Analitics_' + 'clust_' + self.subject, df_result),
                                         method='replace')

    def load_models(self):
        model_file = (self.model_path + 'model_' + self.subject +
                      '_IsolationForest.txt')
        params_file = (self.model_path + 'model_' + self.subject +
                       '_IsolationForest_params.txt')

        if isfile(model_file):
            with open(model_file, 'rb') as file:
                self.modelers['IsolationForest'] = pickle.load(file)

        with open(params_file, 'rb') as file:
            params = pickle.load(file)

        self.scaler = params['scaler']

    def save_models(self):
        model_file = (self.model_path + 'model_' + self.subject +
                      '_IsolationForest.txt')
        params_file = (self.model_path + 'model_' + self.subject +
                       '_IsolationForest_params.txt')

        with open(model_file, 'wb') as file:
            pickle.dump(self.modelers['IsolationForest'], file)

        params = {'scaler': self.scaler}

        with open(params_file, 'wb') as file:
            pickle.dump(params, file)

    def make_predictions(self):
        isolation_forest = self.modelers['IsolationForest']

        data = self.source_db.read(self.subject + '_source',
                                   volume=1).get_data()
        time_col = data['Время']
        data = data[self.targets]

        data_scaled = self.scaler.transform(data)
        isolation_outliers = isolation_forest.predict(data_scaled)

        isolation_outliers = np.array(
            [1 if label == -1 else 0 for label in isolation_outliers])

        df_result = pd.DataFrame(np.vstack([time_col, isolation_outliers]).T,
                                 columns=['time', 'outliers'])

        self.result_db.write_from_df(Table(
            'Analitics_' + 'clust_' + self.subject, df_result),
                                     method='append')
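
# A lifecycle sketch for CoreClust with hypothetical argument values; the
# constructor signature and the method names come from the class above.
if __name__ == '__main__':
    clust = CoreClust(subject='host_a',
                      targets=['ConnectionCount', 'Bytes'],
                      time_cols=['sin_hour', 'cos_hour'],
                      source_db='BaseDB.db',
                      result_db='BaseDB.db')

    clust.fit_core(first=True)   # fit IsolationForest, write initial outliers
    clust.load_models()          # restore the forest and scaler from disk
    clust.make_predictions()     # score the newest row, append the result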